From f3a9c380c71bce99a143b5794da51443da48e8d1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:47 -0700 Subject: [PATCH 001/263] Fix incorrect update in trgtol --- src/trans/gpu/internal/trgtol_mod.F90 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 17cdce08e..bab50b8ad 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1995- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -432,7 +433,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! Copy local contribution !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) - !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) COPYIN(IGPTROFF) + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) COPYIN(IUVLEVS,IUVPARS) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) COPYIN(IGP2PARS) !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) COPYIN(IGP3ALEVS,IGP3APARS) From 020d37e17ab3cf9bf4ad69b8bef1632d616a9d65 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 002/263] Fix for undefined variable --- src/trans/gpu/internal/ledir_mod.F90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 23de16dd3..bfc75abfb 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -299,6 +300,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDDO IF(KMLOC0 > 0) THEN + ISKIP = 2 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KDGLU,ISL) DEFAULT(NONE) DO J=1,R_NDGNH DO JK=1,KFC From be36abcc51be78572c39bc54e302f56b80dbdeb1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 003/263] ISKIP must not be private because defined before --- src/trans/gpu/internal/ledir_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index bfc75abfb..281896aea 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -193,7 +193,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) print*,'computing m=0 in double precision' ISKIP = 2 - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) DO J=1,R_NDGNH DO JK=1,KFC From 7647716f1f96f84f0a5da4de78a6b8e3c4dfc611 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 004/263] Format and restructure OpenACC Loop for packing kernels --- src/trans/gpu/internal/ledir_mod.F90 | 108 +++++++++++++-------------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 281896aea..c1d426d18 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -125,28 +125,25 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF ( KMODE == -1 ) THEN -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,R_NDGNH - DO JK=1,KFC - - KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - !DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*R_NDGNH)*IF_FS_DIR)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF - END IF - ENDDO - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) + DO J=1,R_NDGNH + IF (J .LE. KDGLU) THEN + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + IF(KM == 0)THEN + ISKIP = 2 + ELSE + ISKIP = 1 + ENDIF + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN + DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + END IF + END IF + ENDDO + ENDDO END DO @@ -197,13 +194,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO J=1,R_NDGNH DO JK=1,KFC - KDGLU = MIN(R_NDGNH,G_NDGLU(0)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF - END IF + KDGLU = MIN(R_NDGNH,G_NDGLU(0)) + IF (J .LE. KDGLU) THEN + ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN + DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + END IF + END IF ENDDO ENDDO @@ -238,27 +235,26 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! symmetric -!$acc parallel loop collapse(3) private(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,R_NDGNH - DO JK=1,KFC - KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN -! DZBST((JK-1)/ISKIP+1,J,KMLOC)=PSIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF - END IF - ENDDO - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) + DO J=1,R_NDGNH + IF (J .LE. KDGLU) THEN + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + + IF(KM == 0)THEN + ISKIP = 2 + ELSE + ISKIP = 1 + ENDIF + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN + DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + END IF + END IF + ENDDO + ENDDO END DO ! Get C in transpose format to get better memory access patterns later @@ -303,15 +299,15 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ISKIP = 2 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KDGLU,ISL) DEFAULT(NONE) DO J=1,R_NDGNH - DO JK=1,KFC - KDGLU = MIN(R_NDGNH,G_NDGLU(0)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .eq. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF + DO JK=1,KFC + KDGLU = MIN(R_NDGNH,G_NDGLU(0)) + IF (J .LE. KDGLU) THEN + ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) + IF (MOD((JK-1),ISKIP) .eq. 0) THEN + DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) END IF - ENDDO + END IF + ENDDO ENDDO ! Get C in transpose format to get better memory access patterns later From a66e78ad345f4abec7c41e6d066eb6e545aae459 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 005/263] Remove ISKIP --- src/trans/gpu/internal/ledir_mod.F90 | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c1d426d18..e155b22aa 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -125,7 +125,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF ( KMODE == -1 ) THEN -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) @@ -133,14 +133,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO J=1,R_NDGNH IF (J .LE. KDGLU) THEN ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) END IF ENDDO ENDDO @@ -235,7 +228,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! symmetric -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL,ISKIP) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) @@ -243,15 +236,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO J=1,R_NDGNH IF (J .LE. KDGLU) THEN ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - DZBST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) END IF ENDDO ENDDO From 3184538746549e83b27196c91e17137f3d5bb2d3 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 006/263] Remove unnecessary conditions --- src/trans/gpu/internal/ledir_mod.F90 | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e155b22aa..5ee031661 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -125,16 +125,14 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF ( KMODE == -1 ) THEN -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC - KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - DO J=1,R_NDGNH - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF + KM = D_MYMS(KMLOC) + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + + DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO ENDDO END DO @@ -228,16 +226,14 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! symmetric -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - DO J=1,R_NDGNH - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - END IF + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + + DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO ENDDO END DO From 449d8a8b16c8291ca53fef5fad9c855bfbea5f18 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:48 -0700 Subject: [PATCH 007/263] Clean up unpack --- src/trans/gpu/internal/ledir_mod.F90 | 62 ++++++++++------------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 5ee031661..945b70490 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -153,27 +153,18 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISKIP,ILA,IA,ILS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISKIP,ILA,IA,ILS) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,(R_NTMAX+2)/2 - DO JK=1,KFC - - KM = D_MYMS(KMLOC) - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILA = (R_NTMAX-KM+2)/2 - IA = 1+MOD(R_NTMAX-KM+2,2) - IF (J .LE. ILA) THEN - POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) - END IF - END IF - ENDDO - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + ILA = (R_NTMAX-KM+2)/2 + IA = 1+MOD(R_NTMAX-KM+2,2) + DO J=1,(R_NTMAX+2)/2 + IF (J .LE. ILA) THEN + POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) + END IF + ENDDO + ENDDO ENDDO ! compute m=0 in double precision: @@ -253,27 +244,18 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISKIP,ILA,IA,ILS,IS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISKIP,ILA,IA,ILS,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,(R_NTMAX+3)/2 - DO JK=1,KFC - - KM = D_MYMS(KMLOC) - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILS = (R_NTMAX-KM+3)/2 - IF (J .LE. ILS) THEN - IS = 1+MOD(R_NTMAX-KM+1,2) - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) - END IF - END IF - ENDDO - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + ILS = (R_NTMAX-KM+3)/2 + IS = 1+MOD(R_NTMAX-KM+1,2) + DO J=1,(R_NTMAX+3)/2 + IF (J .LE. ILS) THEN + POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) + END IF + ENDDO + ENDDO ENDDO IF(KMLOC0 > 0) THEN From 16128455c45138c09e0f484c4d1297ca335036d1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:49 -0700 Subject: [PATCH 008/263] Remove redundant if condition --- src/trans/gpu/internal/ledir_mod.F90 | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 945b70490..9661cb322 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -153,16 +153,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISKIP,ILA,IA,ILS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) - ILA = (R_NTMAX-KM+2)/2 IA = 1+MOD(R_NTMAX-KM+2,2) - DO J=1,(R_NTMAX+2)/2 - IF (J .LE. ILA) THEN - POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) - END IF + DO J=1,(R_NTMAX-KM+2)/2 + POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) ENDDO ENDDO ENDDO @@ -244,16 +241,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISKIP,ILA,IA,ILS,IS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) - ILS = (R_NTMAX-KM+3)/2 IS = 1+MOD(R_NTMAX-KM+1,2) - DO J=1,(R_NTMAX+3)/2 - IF (J .LE. ILS) THEN - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) - END IF + DO J=1,(R_NTMAX-KM+3)/2 + POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) ENDDO ENDDO ENDDO From 0e2039937f7ca3be2eb45c746d472573f2ae872a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:49 -0700 Subject: [PATCH 009/263] Fix loop bounds for KMLOC0 packing --- src/trans/gpu/internal/ledir_mod.F90 | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 9661cb322..c8f0bca12 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -169,16 +169,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) print*,'computing m=0 in double precision' ISKIP = 2 - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,KDGLU,ISL) DEFAULT(NONE) - DO J=1,R_NDGNH + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) + DO J=1,MIN(R_NDGNH,G_NDGLU(0)) DO JK=1,KFC - - KDGLU = MIN(R_NDGNH,G_NDGLU(0)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF + ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN + DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) END IF ENDDO ENDDO @@ -254,15 +250,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF(KMLOC0 > 0) THEN ISKIP = 2 - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KDGLU,ISL) DEFAULT(NONE) - DO J=1,R_NDGNH + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) + DO J=1,MIN(R_NDGNH,G_NDGLU(0)) DO JK=1,KFC - KDGLU = MIN(R_NDGNH,G_NDGLU(0)) - IF (J .LE. KDGLU) THEN - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .eq. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF + ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) + IF (MOD((JK-1),ISKIP) .eq. 0) THEN + DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) END IF ENDDO ENDDO From e41ca8b100cd45d55915582370c8980769036fc3 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:49 -0700 Subject: [PATCH 010/263] Remove conditions --- src/trans/gpu/internal/ledir_mod.F90 | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c8f0bca12..67577708b 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -171,11 +171,9 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) - DO JK=1,KFC + DO JK=1,KFC,2 ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF + DZBST0((JK-1)/2+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO ENDDO @@ -252,11 +250,9 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ISKIP = 2 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) - DO JK=1,KFC + DO JK=1,KFC,2 ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - IF (MOD((JK-1),ISKIP) .eq. 0) THEN - DZBST0((JK-1)/ISKIP+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - END IF + DZBST0((JK-1)/2+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO ENDDO From 6d701aca8e1df3a54137857a4779e65d66714246 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:49 -0700 Subject: [PATCH 011/263] Remove ISKIP for unpack loop and globally --- src/trans/gpu/internal/ledir_mod.F90 | 48 ++++++++++++---------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 67577708b..adcaec316 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) ! LOCAL VARIABLES -INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, IF, J, JK, IRET +INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISL, IF, J, JK, IRET INTEGER(KIND=JPIM) :: ITHRESHOLD REAL(KIND=JPRB) :: RRPELTMDIR = 100.0_JPRB REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -167,7 +167,6 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! compute m=0 in double precision: IF(KMLOC0 > 0) THEN print*,'computing m=0 in double precision' - ISKIP = 2 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) @@ -190,18 +189,16 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! &ZAA0,LDZAA,0._JPRD,DZCAT0,DTDZCA) !$ACC END HOST_DATA - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ILA,IA,ILS) DEFAULT(NONE) - DO J=1,(R_NTMAX+2)/2 - DO JK=1,KFC - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILA = (R_NTMAX+2)/2 - IA = 1+MOD(R_NTMAX+2,2) - IF (J .LE. ILA) THEN - POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/ISKIP+1+(J-1)*DTDZCA) - END IF - END IF - ENDDO -ENDDO + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ILA,IA,ILS) DEFAULT(NONE) + DO J=1,(R_NTMAX+2)/2 + DO JK=1,KFC,2 + ILA = (R_NTMAX+2)/2 + IA = 1+MOD(R_NTMAX+2,2) + IF (J .LE. ILA) THEN + POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*DTDZCA) + END IF + ENDDO + ENDDO ENDIF ELSE @@ -247,7 +244,6 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDDO IF(KMLOC0 > 0) THEN - ISKIP = 2 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) DO JK=1,KFC,2 @@ -268,18 +264,16 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & 0._JPRD,DZCST0,DTDZCS,DLDZCS,1) !$ACC end host_data - !$ACC parallel loop collapse(2) private(ILA,IA,ILS,IS) DEFAULT(NONE) - DO J=1,(R_NTMAX+3)/2 - DO JK=1,KFC - if (MOD((JK-1),ISKIP) .eq. 0) then - ILS = (R_NTMAX+3)/2 - if (J .le. ILS) then - IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/ISKIP+1+(J-1)*DTDZCS) - end if - end if - ENDDO - ENDDO + !$ACC parallel loop collapse(2) private(ILA,IA,ILS,IS) DEFAULT(NONE) + DO J=1,(R_NTMAX+3)/2 + DO JK=1,KFC,2 + ILS = (R_NTMAX+3)/2 + if (J .le. ILS) then + IS = 1+MOD(R_NTMAX+1,2) + POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*DTDZCS) + end if + ENDDO + ENDDO ENDIF From c29edad13f932e53302ce19feeeef04bfdbfd810 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:49 -0700 Subject: [PATCH 012/263] Remove redundant condition --- src/trans/gpu/internal/ledir_mod.F90 | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index adcaec316..04081808d 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -189,14 +189,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! &ZAA0,LDZAA,0._JPRD,DZCAT0,DTDZCA) !$ACC END HOST_DATA - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ILA,IA,ILS) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) DO J=1,(R_NTMAX+2)/2 DO JK=1,KFC,2 - ILA = (R_NTMAX+2)/2 IA = 1+MOD(R_NTMAX+2,2) - IF (J .LE. ILA) THEN - POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*DTDZCA) - END IF + POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*DTDZCA) ENDDO ENDDO ENDIF @@ -264,14 +261,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) & 0._JPRD,DZCST0,DTDZCS,DLDZCS,1) !$ACC end host_data - !$ACC parallel loop collapse(2) private(ILA,IA,ILS,IS) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) DO J=1,(R_NTMAX+3)/2 DO JK=1,KFC,2 - ILS = (R_NTMAX+3)/2 - if (J .le. ILS) then - IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*DTDZCS) - end if + IS = 1+MOD(R_NTMAX+1,2) + POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*DTDZCS) ENDDO ENDDO From 5b4d9703a5716854cae3421986d52f11dee05356 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:50 -0700 Subject: [PATCH 013/263] remaining small cleanup --- src/trans/gpu/internal/ledir_mod.F90 | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 04081808d..42503445a 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -79,30 +79,22 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! DUMMY ARGUMENTS INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM) :: KFC -INTEGER(KIND=JPIM) :: KIFC INTEGER(KIND=JPIM) :: KDGLU INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS INTEGER(KIND=JPIM), INTENT(IN) :: KLED2 INTEGER(KIND=JPIM), INTENT(IN) :: KMODE REAL(KIND=JPRBT), INTENT(IN) :: PAIA(:,:,:) -!REAL(KIND=JPRBT), INTENT(IN) :: PSIA(:,:,:), PAIA(:,:,:) REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) ! LOCAL VARIABLES -INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISL, IF, J, JK, IRET -INTEGER(KIND=JPIM) :: ITHRESHOLD -REAL(KIND=JPRB) :: RRPELTMDIR = 100.0_JPRB +INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER :: ISTAT IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -KFC = 2*KF_FS -KIFC = KFC - !$ACC DATA & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_NUMP,D_MYMS,R,R_NDGNH,G,G_NDGLU,R_NSMAX,R_NTMAX) & @@ -127,7 +119,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) @@ -155,7 +147,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IA = 1+MOD(R_NTMAX-KM+2,2) DO J=1,(R_NTMAX-KM+2)/2 @@ -170,7 +162,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) - DO JK=1,KFC,2 + DO JK=1,2*KF_FS,2 ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) DZBST0((JK-1)/2+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO @@ -191,7 +183,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) DO J=1,(R_NTMAX+2)/2 - DO JK=1,KFC,2 + DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*DTDZCA) ENDDO @@ -204,7 +196,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) @@ -231,7 +223,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IS = 1+MOD(R_NTMAX-KM+1,2) DO J=1,(R_NTMAX-KM+3)/2 @@ -243,7 +235,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF(KMLOC0 > 0) THEN !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) DO J=1,MIN(R_NDGNH,G_NDGLU(0)) - DO JK=1,KFC,2 + DO JK=1,2*KF_FS,2 ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) DZBST0((JK-1)/2+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO @@ -263,7 +255,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) DO J=1,(R_NTMAX+3)/2 - DO JK=1,KFC,2 + DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*DTDZCS) ENDDO From ff4340a9f1c8dbabd57cf684af0ae68986195c89 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:50 -0700 Subject: [PATCH 014/263] Fix openacc --- src/trans/gpu/internal/ledir_mod.F90 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 42503445a..4f716b1ad 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -123,6 +123,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) KM = D_MYMS(KMLOC) ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + !$ACC LOOP SEQ DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO @@ -150,6 +151,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IA = 1+MOD(R_NTMAX-KM+2,2) + !$ACC LOOP SEQ DO J=1,(R_NTMAX-KM+2)/2 POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) ENDDO @@ -200,6 +202,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) KM = D_MYMS(KMLOC) ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + !$ACC LOOP SEQ DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO @@ -226,6 +229,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IS = 1+MOD(R_NTMAX-KM+1,2) + !$ACC LOOP SEQ DO J=1,(R_NTMAX-KM+3)/2 POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) ENDDO From 56ea3b2f103ec5ffbfe197bcfb77ab4d09853e73 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:50 -0700 Subject: [PATCH 015/263] Do not compute KMLOC0 twice --- src/trans/gpu/internal/ledir_mod.F90 | 52 ++++++++++++++++------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 4f716b1ad..71a9cecf6 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -121,12 +121,14 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO KMLOC=1,D_NUMP DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - - !$ACC LOOP SEQ - DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - ENDDO + IF (KM /= 0) THEN + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + + !$ACC LOOP SEQ + DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + ENDDO + ENDIF ENDDO END DO @@ -150,11 +152,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO KMLOC=1,D_NUMP DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) - IA = 1+MOD(R_NTMAX-KM+2,2) - !$ACC LOOP SEQ - DO J=1,(R_NTMAX-KM+2)/2 - POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) - ENDDO + IF (KM /= 0) THEN + IA = 1+MOD(R_NTMAX-KM+2,2) + !$ACC LOOP SEQ + DO J=1,(R_NTMAX-KM+2)/2 + POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) + ENDDO + ENDIF ENDDO ENDDO @@ -200,12 +204,14 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO KMLOC=1,D_NUMP DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - - !$ACC LOOP SEQ - DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - ENDDO + IF (KM /= 0) THEN + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + + !$ACC LOOP SEQ + DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) + DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + ENDDO + ENDIF ENDDO END DO @@ -228,11 +234,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO KMLOC=1,D_NUMP DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) - IS = 1+MOD(R_NTMAX-KM+1,2) - !$ACC LOOP SEQ - DO J=1,(R_NTMAX-KM+3)/2 - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) - ENDDO + IF (KM /= 0) THEN + IS = 1+MOD(R_NTMAX-KM+1,2) + !$ACC LOOP SEQ + DO J=1,(R_NTMAX-KM+3)/2 + POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) + ENDDO + ENDIF ENDDO ENDDO From e5d2b06601418f2f52282918cc0bce5a524340ab Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:50 -0700 Subject: [PATCH 016/263] Simplify final write --- src/trans/gpu/internal/updspb_mod.F90 | 57 +++++++++++---------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 8d66b5c7f..9d319e643 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1988- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -95,41 +96,29 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !loop over wavenumber !$ACC DATA PRESENT(PSPEC,POA,R,D) - !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) - DO KMLOC=1,D%NUMP - DO JN=R%NTMAX+2-R%NSMAX,R%NTMAX+2 - DO JFLD=1,KFIELD - - KM = D%MYMS(KMLOC) - IASM0 = D%NASM0(KM) - - IF(KM == 0) THEN - - if (JN .le. R%NTMAX+2-KM) then - - INM = IASM0+(R%NTMAX+2-JN)*2 - IR = 2*JFLD-1 - PSPEC(JFLD,INM) = POA(IR,JN,KMLOC) - PSPEC(JFLD,INM+1) = 0.0_JPRBT - - end if - ELSE - - - if (JN .le. R%NTMAX+2-KM) then - INM = IASM0+((R%NTMAX+2-JN)-KM)*2 - - IR = 2*JFLD-1 - II = IR+1 - PSPEC(JFLD,INM) = POA(IR,JN,KMLOC) - PSPEC(JFLD,INM+1) = POA(II,JN,KMLOC) - - end if - end if - - ENDDO - ENDDO + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) + DO KMLOC=1,D%NUMP + DO JFLD=1,KFIELD + KM = D%MYMS(KMLOC) + IASM0 = D%NASM0(KM) + + IF(KM == 0) THEN + !$ACC LOOP SEQ + DO JN=R%NTMAX+2-R%NSMAX,R%NTMAX+2 + INM = IASM0+(R%NTMAX+2-JN)*2 + PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) + PSPEC(JFLD,INM+1) = 0.0_JPRBT + ENDDO + ELSE + !$ACC LOOP SEQ + DO JN=R%NTMAX+2-R%NSMAX,R%NTMAX+2-KM + INM = IASM0+((R%NTMAX+2-JN)-KM)*2 + PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) + PSPEC(JFLD,INM+1) = POA(2*JFLD ,JN,KMLOC) + ENDDO + END IF ENDDO + ENDDO !$ACC END PARALLEL !$ACC END DATA From 03d997765cd7d74d37f0880d4133d4f5251b7658 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 017/263] Simplify matrix multiplications --- src/trans/gpu/internal/ledir_mod.F90 | 106 +++++++++++++-------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 71a9cecf6..5e6d4c15b 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -59,11 +59,9 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F, & - & ZAA,ZAS,LDZAA,LDZAS,TDZAA,TDZAS,& - & DZBST,DLDZBA,DLDZBS,DTDZBA,DTDZBS,& - & DZCST,DZCAT,DLDZCA,DLDZCS,DTDZCA,DTDZCS,& - & ZAMAX, ZSMAX,& - & IF_FS_DIR,ZAA0,DZBST0,DZCAT0,ZAS0,DZCST0,KMLOC0 + & ZAA,DZBST,DZCAT,ZAS,DZCST,& + & ZAA0,DZBST0,DZCAT0,ZAS0,DZCST0,& + & TDZAA,TDZAS,KMLOC0 USE TPM_DISTR USE TPM_GEN, ONLY: NOUT USE TPM_FLT @@ -103,16 +101,6 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC& PRESENT(POA1,dzbst0,dzcat0,dzbst0,dzcst0) !& -!! Initialize rescaling arrays to zero -!!$ACC PARALLEL LOOP COLLAPSE(2) -!DO KMLOC=1,SIZE(ZAMAX,2) -! DO JK=1,SIZE(ZAMAX,1) -! ZAMAX(JK,KMLOC) = 0.0_JPRBT -! ZSMAX(JK,KMLOC) = 0.0_JPRBT -! ENDDO -!ENDDO - - ! anti-symmetric IF ( KMODE == -1 ) THEN @@ -122,11 +110,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IF (KM /= 0) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ - DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBA)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + DO J=1,G_NDGLU(KM) + DZBST((JK-1)+1+(J-1+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO ENDIF ENDDO @@ -139,12 +127,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC HOST_DATA USE_DEVICE(ZAA,DZBST,DZCAT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & - & DTDZBA, TDZAA, DLDZBA, & + & 2*KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRBT, & - & DZBST, DTDZBA, DLDZBA, & - & ZAA, LDZAA, TDZAA, & - & 0._JPRBT, & - & DZCAT, DTDZCA, DLDZCA, & + & DZBST, 2*KF_FS, R_NDGNH, & + & ZAA, R_NDGNH, TDZAA, & + & 0.0_JPRBT, & + & DZCAT, 2*KF_FS, TDZAA, & & D_NUMP) !$ACC END HOST_DATA @@ -155,8 +143,8 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF (KM /= 0) THEN IA = 1+MOD(R_NTMAX-KM+2,2) !$ACC LOOP SEQ - DO J=1,(R_NTMAX-KM+2)/2 - POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*DLDZCA)*DTDZCA) + DO J=1,TDZAA + POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) ENDDO ENDIF ENDDO @@ -167,10 +155,10 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) print*,'computing m=0 in double precision' !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) - DO J=1,MIN(R_NDGNH,G_NDGLU(0)) + DO J=1,G_NDGLU(0) DO JK=1,2*KF_FS,2 - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - DZBST0((JK-1)/2+1+(J-1)*DTDZBA)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + ISL = R_NDGNH-G_NDGLU(0)+1 + DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO ENDDO @@ -180,18 +168,22 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! C^T=B^T*A^T !$ACC HOST_DATA USE_DEVICE(ZAA0,DZBST0,DZCAT0) - CALL CUDA_DGEMM_BATCHED('N','N',DTDZBA,int(TDZAA,kind=jpim),int(DLDZBA,kind=jpim), & - & 1.0_JPRD,DZBST0,DTDZBA,int(DLDZBA,kind=jpim),& - & ZAA0,LDZAA,int(TDZAA,kind=jpim),0._JPRD,DZCAT0,DTDZCA,int(DLDZCA,kind=jpim),1) - !call CUDA_DGEMM('N','N',DTDZBA,TDZAA,DLDZBA,1.0_JPRD,DZBST0,DTDZBA,& - ! &ZAA0,LDZAA,0._JPRD,DZCAT0,DTDZCA) + CALL CUDA_DGEMM_BATCHED( & + & 'N', 'N', & + & 2*KF_FS, TDZAA, R_NDGNH, & + & 1.0_JPRD, & + & DZBST0, 2*KF_FS, R_NDGNH, & + & ZAA0, R_NDGNH, TDZAA, & + & 0.0_JPRD, & + & DZCAT0, 2*KF_FS, TDZAA, & + & 1) !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NTMAX+2)/2 + DO J=1,TDZAA DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*DTDZCA) + POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO ENDIF @@ -205,11 +197,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO JK=1,2*KF_FS KM = D_MYMS(KMLOC) IF (KM /= 0) THEN - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ - DO J=1,MIN(R_NDGNH,G_NDGLU(KM)) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*DLDZBS)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) + DO J=1,G_NDGLU(KM) + DZBST(JK+(J-1+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) ENDDO ENDIF ENDDO @@ -221,12 +213,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC HOST_DATA USE_DEVICE(ZAS,DZBST,DZCST) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & - & DTDZBS, TDZAS, DLDZBS, & + & 2*KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRBT, & - & DZBST, DTDZBS, DLDZBS, & - & ZAS, LDZAS, TDZAS, & - & 0._JPRBT, & - & DZCST, DTDZCS, DLDZCS, & + & DZBST, 2*KF_FS, R_NDGNH, & + & ZAS, R_NDGNH, TDZAS, & + & 0.0_JPRBT, & + & DZCST, 2*KF_FS, TDZAS, & & D_NUMP) !$ACC END HOST_DATA @@ -237,8 +229,8 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF (KM /= 0) THEN IS = 1+MOD(R_NTMAX-KM+1,2) !$ACC LOOP SEQ - DO J=1,(R_NTMAX-KM+3)/2 - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) + DO J=1,TDZAS + POA1(JK,IS+(J-1)*2,KMLOC) = DZCST(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) ENDDO ENDIF ENDDO @@ -246,10 +238,10 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF(KMLOC0 > 0) THEN !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) - DO J=1,MIN(R_NDGNH,G_NDGLU(0)) + DO J=1,G_NDGLU(0) DO JK=1,2*KF_FS,2 - ISL = MAX(R_NDGNH-G_NDGLU(0)+1,1) - DZBST0((JK-1)/2+1+(J-1)*DTDZBS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + ISL = R_NDGNH-G_NDGLU(0)+1 + DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) ENDDO ENDDO @@ -258,18 +250,22 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! C^T=B^T*A^T !$ACC host_data use_device(ZAS0,DZBST0,DZCST0) - call CUDA_DGEMM_BATCHED('N','N',& - & DTDZBS,TDZAS,DLDZBS,& - & 1.0_JPRD,DZBST0,DTDZBS,DLDZBS,& - & ZAS0,LDZAS,TDZAS,& - & 0._JPRD,DZCST0,DTDZCS,DLDZCS,1) + call CUDA_DGEMM_BATCHED( & + & 'N', 'N', & + & 2*KF_FS, TDZAS, R_NDGNH, & + & 1.0_JPRD, & + & DZBST0, 2*KF_FS, R_NDGNH, & + & ZAS0, R_NDGNH, TDZAS, & + & 0.0_JPRD, & + & DZCST0, 2*KF_FS, TDZAS, & + & 1) !$ACC end host_data !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) - DO J=1,(R_NTMAX+3)/2 + DO J=1,TDZAS DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*DTDZCS) + POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO From 1f4ac75a7d855bcdd333770d4c9d3351b35c5462 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 018/263] Format LEDIR --- src/trans/gpu/internal/ledir_mod.F90 | 46 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 5e6d4c15b..512bdaf6b 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -237,29 +237,29 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDDO IF(KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) - DO J=1,G_NDGLU(0) - DO JK=1,2*KF_FS,2 - ISL = R_NDGNH-G_NDGLU(0)+1 - DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) - ENDDO - ENDDO - - ! Get C in transpose format to get better memory access patterns later - !C=A*B => - ! C^T=B^T*A^T - - !$ACC host_data use_device(ZAS0,DZBST0,DZCST0) - call CUDA_DGEMM_BATCHED( & - & 'N', 'N', & - & 2*KF_FS, TDZAS, R_NDGNH, & - & 1.0_JPRD, & - & DZBST0, 2*KF_FS, R_NDGNH, & - & ZAS0, R_NDGNH, TDZAS, & - & 0.0_JPRD, & - & DZCST0, 2*KF_FS, TDZAS, & - & 1) - !$ACC end host_data + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) + DO J=1,G_NDGLU(0) + DO JK=1,2*KF_FS,2 + ISL = R_NDGNH-G_NDGLU(0)+1 + DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + ENDDO + ENDDO + + ! Get C in transpose format to get better memory access patterns later + !C=A*B => + ! C^T=B^T*A^T + + !$ACC host_data use_device(ZAS0,DZBST0,DZCST0) + call CUDA_DGEMM_BATCHED( & + & 'N', 'N', & + & 2*KF_FS, TDZAS, R_NDGNH, & + & 1.0_JPRD, & + & DZBST0, 2*KF_FS, R_NDGNH, & + & ZAS0, R_NDGNH, TDZAS, & + & 0.0_JPRD, & + & DZCST0, 2*KF_FS, TDZAS, & + & 1) + !$ACC end host_data !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) DO J=1,TDZAS From 0f081b753b21a400fa400def6e8e51b16868742a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 019/263] Make output loops smaller --- src/trans/gpu/internal/ledir_mod.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 512bdaf6b..e9349a0c4 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -143,7 +143,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF (KM /= 0) THEN IA = 1+MOD(R_NTMAX-KM+2,2) !$ACC LOOP SEQ - DO J=1,TDZAA + DO J=1,(R%NSMAX-KM+2)/2 POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) ENDDO ENDIF @@ -180,7 +180,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,TDZAA + DO J=1,(R_NSMAX+2)/2 DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*2*KF_FS) @@ -229,7 +229,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF (KM /= 0) THEN IS = 1+MOD(R_NTMAX-KM+1,2) !$ACC LOOP SEQ - DO J=1,TDZAS + DO J=1,(R_NSMAX-KM+3)/2 POA1(JK,IS+(J-1)*2,KMLOC) = DZCST(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) ENDDO ENDIF @@ -262,7 +262,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC end host_data !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) - DO J=1,TDZAS + DO J=1,(R_NSMAX+3)/2 DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*2*KF_FS) From d03e67b5a7098f9a7c0ce14a51209539e4a25fed Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 020/263] Cleanup uvtvd --- src/trans/gpu/internal/uvtvd_mod.F90 | 116 +++++++++++++-------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 0c7c79f6e..3fe9a9778 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1991- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,8 +10,8 @@ MODULE UVTVD_MOD CONTAINS -SUBROUTINE UVTVD(KFIELD) -!SUBROUTINE UVTVD(KFIELD,PEPSNM,PU,PV,PVOR,PDIV) +SUBROUTINE UVTVD(KF_UV) +!SUBROUTINE UVTVD(KF_UV,PEPSNM,PU,PV,PVOR,PDIV) !**** *UVTVD* - Compute vor/div from u and v in spectral space @@ -22,10 +23,10 @@ SUBROUTINE UVTVD(KFIELD) !** Interface. ! ---------- -! CALL UVTVD(KM,KFIELD,PEPSNM,PU,PV,PVOR,PDIV) +! CALL UVTVD(KM,KF_UV,PEPSNM,PU,PV,PVOR,PDIV) ! Explicit arguments : KM - zonal wave-number -! -------------------- KFIELD - number of fields (levels) +! -------------------- KF_UV - number of fields (levels) ! PEPSNM - REPSNM for wavenumber KM ! PU - u wind component for zonal ! wavenumber KM @@ -68,7 +69,7 @@ SUBROUTINE UVTVD(KFIELD) IMPLICIT NONE ! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV INTEGER(KIND=JPIM) :: KM, KMLOC !REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:d%nump,0:R%NTMAX+2) @@ -80,18 +81,17 @@ SUBROUTINE UVTVD(KFIELD) INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE ! LOCAL REAL SCALARS -REAL(KIND=JPRBT) :: ZKM -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+3) +REAL(KIND=JPRBT) :: ZKM,ZJN REAL(KIND=JPRBT), POINTER :: PU(:,:,:),PV(:,:,:),PVOR(:,:,:),PDIV(:,:,:) IUS = 1 -IUE = 2*KFIELD -IVS = 2*KFIELD+1 -IVE = 4*KFIELD +IUE = 2*KF_UV +IVS = 2*KF_UV+1 +IVE = 4*KF_UV IVORS = 1 -IVORE = 2*KFIELD -IDIVS = 2*KFIELD+1 -IDIVE = 4*KFIELD +IVORE = 2*KF_UV +IDIVS = 2*KF_UV+1 +IDIVE = 4*KF_UV ! ------------------------------------------------------------------ @@ -104,64 +104,64 @@ SUBROUTINE UVTVD(KFIELD) PDIV => ZOA2(IDIVS:IDIVE,:,:) !$ACC DATA& -!$ACC& CREATE(ZN) & -!$ACC& COPY(D_MYMS,D_NUMP,R_NTMAX) & -!$ACC& COPY(F,F%RN,F%NLTN) & +!$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & +!$ACC& PRESENT(F,F%RN,F%NLTN) & !$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) -!$ACC PARALLEL LOOP DEFAULT(NONE) -DO J=-1,R_NTMAX+3 - ZN(j) = F%RN(j) -ENDDO !* 1.1 SET N=KM-1 COMPONENT TO 0 FOR U AND V !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IN) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,2*KFIELD + DO J=1,2*KF_UV KM = D_MYMS(KMLOC) - IN = F%NLTN(KM-1) -! IN=R_NTMAX+3-KM - PU(J,IN,KMLOC) = 0.0_JPRBT - PV(J,IN,KMLOC) = 0.0_JPRBT + PU(J,R_NTMAX+3-KM,KMLOC) = 0.0_JPRBT + PV(J,R_NTMAX+3-KM,KMLOC) = 0.0_JPRBT ENDDO ENDDO !* 1.2 COMPUTE VORTICITY AND DIVERGENCE. -!$ACC parallel loop collapse(3) private(IR,II,IN,KM,ZKM) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JN=0,R_NTMAX - DO J=1,KFIELD - IR = 2*J-1 - II = IR+1 - KM = D_MYMS(KMLOC) - ZKM = REAL(KM,JPRBT) - IN = R_NTMAX+2-JN - - IF(KM /= 0 .and. JN.GE.KM) THEN - PVOR(IR,IN,kmloc) = -ZKM*PV(II,IN,kmloc)-& - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,kmloc)+& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,kmloc) - PVOR(II,IN,kmloc) = +ZKM*PV(IR,IN,kmloc)-& - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PU(II,IN-1,kmloc)+& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PU(II,IN+1,kmloc) - PDIV(IR,IN,kmloc) = -ZKM*PU(II,IN,kmloc)+& - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,kmloc)-& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,kmloc) - PDIV(II,IN,kmloc) = +ZKM*PU(IR,IN,kmloc)+& - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PV(II,IN-1,kmloc)-& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PV(II,IN+1,kmloc) - ELSE - IF(KM == 0) THEN - PVOR(IR,IN,kmloc) = -& - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,kmloc)+& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,kmloc) - PDIV(IR,IN,kmloc) = & - &ZN(JN)*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,kmloc)-& - &ZN(JN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,kmloc) - ENDIF - ENDIF - ENDDO + DO J=1,KF_UV + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) + + IF(KM /= 0) THEN + !$ACC LOOP SEQ + DO JN=KM,R_NTMAX + IN = R_NTMAX+2-JN + ZJN = JN + + PVOR(IR,IN,kmloc) = -ZKM*PV(II,IN,kmloc)-& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,kmloc)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,kmloc) + PVOR(II,IN,kmloc) = +ZKM*PV(IR,IN,kmloc)-& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(II,IN-1,kmloc)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(II,IN+1,kmloc) + PDIV(IR,IN,kmloc) = -ZKM*PU(II,IN,kmloc)+& + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,kmloc)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,kmloc) + PDIV(II,IN,kmloc) = +ZKM*PU(IR,IN,kmloc)+& + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(II,IN-1,kmloc)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(II,IN+1,kmloc) + ENDDO + ELSE + !$ACC LOOP SEQ + DO JN=0,R_NTMAX + IN = R_NTMAX+2-JN + ZJN = JN + + PVOR(IR,IN,kmloc) = -& + &ZJN*ZEPSNM(KMLOC,JN+1)*PU(IR,IN-1,kmloc)+& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PU(IR,IN+1,kmloc) + PDIV(IR,IN,kmloc) = & + &ZJN*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,kmloc)-& + &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,kmloc) + ENDDO + ENDIF ENDDO ENDDO !$acc end data From 2b07cf91e388d6396a6aa20a479a12be1008e8e5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 021/263] Cleanup ldfou2 --- src/trans/gpu/internal/ldfou2_mod.F90 | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/trans/gpu/internal/ldfou2_mod.F90 b/src/trans/gpu/internal/ldfou2_mod.F90 index d2ba772f6..d314f5820 100755 --- a/src/trans/gpu/internal/ldfou2_mod.F90 +++ b/src/trans/gpu/internal/ldfou2_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1991- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -74,15 +75,14 @@ SUBROUTINE LDFOU2(KF_UV,PAIA) !REAL(KIND=JPRBT) ,INTENT(INOUT) :: PSIA(:,:,:), PAIA(:,:,:) ! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: J, JGL ,IFLD ,ISL, IGLS +INTEGER(KIND=JPIM) :: J, JGL ,ISL ! ------------------------------------------------------------------ !* 1. DIVIDE U V BY A*COS(THETA) ! -------------------------- -IFLD = 4*KF_UV -IF( IFLD > 0 ) THEN +IF( KF_UV > 0 ) THEN !$ACC DATA & !$ACC& PRESENT(F,F%RACTHE,D,D_NUMP,D_MYMS,R_NDGNH,R_NDGL,G_NDGLU) & @@ -90,18 +90,15 @@ SUBROUTINE LDFOU2(KF_UV,PAIA) !loop over wavenumber -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISL,IGLS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO J=1,4*KF_UV - KM = D_MYMS(KMLOC) - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) + DO J=1,4*KF_UV ! (real+complex) * (U+V) + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH !* 1.1 U AND V - if (JGL .ge. ISL) then - IGLS = R_NDGL+1-JGL - PAIA(J,JGL,KMLOC) = PAIA(J,JGL,KMLOC)*F%RACTHE(JGL) -! PSIA(J,JGL,KMLOC) = PSIA(J,JGL,KMLOC)*F%RACTHE(JGL) - endif + PAIA(J,JGL,KMLOC) = PAIA(J,JGL,KMLOC)*F%RACTHE(JGL) ENDDO ENDDO ENDDO From e415b09c88345252f28c277e1565b663589516f7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:51 -0700 Subject: [PATCH 022/263] Cleanup prfi2b --- src/trans/gpu/internal/prfi2b_mod.F90 | 41 +++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/trans/gpu/internal/prfi2b_mod.F90 b/src/trans/gpu/internal/prfi2b_mod.F90 index 33962319f..ea9f4bc77 100755 --- a/src/trans/gpu/internal/prfi2b_mod.F90 +++ b/src/trans/gpu/internal/prfi2b_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1990- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,7 +10,7 @@ MODULE PRFI2B_MOD CONTAINS - SUBROUTINE PRFI2B(KFIELD,PAIA,KMODE) + SUBROUTINE PRFI2B(KF_FS,PAIA,KMODE) !**** *PRFI2B* - Prepare input work arrays for direct transform @@ -24,7 +25,7 @@ SUBROUTINE PRFI2B(KFIELD,PAIA,KMODE) ! *CALL* *PRFI2B(..) ! Explicit arguments : - ! ------------------- KFIELD - number of fields + ! ------------------- KF_FS - number of fields ! KM - zonal wavenumber ! KMLOC - local zonal wavenumber ! PAOA - antisymmetric part of Fourier @@ -66,7 +67,7 @@ SUBROUTINE PRFI2B(KFIELD,PAIA,KMODE) IMPLICIT NONE - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS INTEGER(KIND=JPIM),INTENT(IN) :: KMODE INTEGER(KIND=JPIM) :: KM,KMLOC REAL(KIND=JPRBT) , INTENT(OUT) :: PAIA(:,:,:) @@ -87,25 +88,23 @@ SUBROUTINE PRFI2B(KFIELD,PAIA,KMODE) !$ACC DATA PRESENT(PAIA,FOUBUF, D_NPNTGTB1,D_NSTAGT1B,D_MYMS,R_NDGL,R_NDGNH,G_NDGLU,D_NPROCL) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL) DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JF=1,KFIELD*2 - KM = D_MYMS(KMLOC) - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - if (JGL .ge. ISL) then - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KFIELD - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD - IF( KMODE == -1 ) THEN - PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - ELSE - PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) -! PSIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - ENDIF - end if - ENDDO - ENDDO + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS + IF( KMODE == -1 ) THEN + PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + ELSE + PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + ENDIF + ENDDO + ENDDO END DO !$ACC END DATA From 4ce37723f1fc67538cd8b21097fe0193869200ae Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 023/263] simple rename --- src/trans/gpu/internal/trgtol_mod.F90 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index bab50b8ad..160fad636 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -134,8 +134,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(MAX(KF_GP,KF_FS)) - INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_END - INTEGER(KIND=JPIM) :: IRECV_FLD_END + INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT + INTEGER(KIND=JPIM) :: IRECV_FLD_CNT INTEGER(KIND=JPIM) :: INUMFLDS INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) @@ -525,7 +525,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ISEND=JSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) ISENDSET = ISETV - ISEND_FLD_END = ISEND_FLD_TOTAL(ISEND) + ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) IFLD = 0 IPOS = 0 DO JFLD=1,KF_GP @@ -548,7 +548,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(INS,JK_MAX,IJPOS,IFLDA) - DO JJ=1,ISEND_FLD_END + DO JJ=1,ISEND_FLD_CNT DO JBLK=1,NGPBLKS DO JKL=1, JK_MAX IFLDT=IFLDA(JJ) @@ -665,9 +665,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INR=1,INRECV IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS - IRECV_FLD_END = ICOMBUFR_FLD(INR) - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) COPYIN(IRECV,ILEN,IRECV_FLD_END) - DO JFLD=1,IRECV_FLD_END + IRECV_FLD_CNT = ICOMBUFR_FLD(INR) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) COPYIN(IRECV,ILEN,IRECV_FLD_CNT) + DO JFLD=1,IRECV_FLD_CNT DO JL=1,ILEN II = KINDEX(INDOFF(IRECV)+JL) PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-1)*ILEN,INR) From 184350bd420f84766ce6551652e7471e53227bf8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 024/263] Improve performance for scaling/ kernel --- src/trans/gpu/internal/ftdir_mod.F90 | 46 +++++++++++++--------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 87cfeb031..c28c7f81b 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -89,10 +90,6 @@ SUBROUTINE FTDIR(KFIELDS) IINC=-1 ENDIF -OFFSET_VAR=D_NPTRLS(MYSETW) - -IMAX = G_NLOEN_MAX + 2 + R_NNOEXTZL - allocate(zgtf2(size(zgtf,1),size(zgtf,2))) !$ACC DATA & @@ -118,31 +115,30 @@ SUBROUTINE FTDIR(KFIELDS) istat = cuda_Synchronize() -!$acc kernels DEFAULT(NONE) -zgtf(:,:) = zgtf2(:,:) -!$acc end kernels -!$acc end data - -!$ACC parallel loop collapse(3) private(JMAX,KGL,IOFF,SCAL,IST) DEFAULT(NONE) +OFFSET_VAR=D_NPTRLS(MYSETW) +!$ACC parallel loop collapse(2) private(JMAX,JJ,KGL,IOFF,SCAL,IST) DEFAULT(NONE) DO IGLG=IBEG+OFFSET_VAR-1,IEND+OFFSET_VAR-1,IINC - DO JJ=1, IMAX - DO JF=1,KFIELDS - JMAX = G_NLOEN(IGLG) - IST = 2*(G_NMEN(IGLG)+1) - if (JJ .le. JMAX) then - KGL=IGLG-OFFSET_VAR+1 - IOFF=D_NSTAGTF(KGL)+1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - ZGTF(JF,IOFF+JJ-1)= SCAL * ZGTF(JF, IOFF+JJ-1) - end if - - ! case JJ>0 - IF( JJ .le. (JMAX+R_NNOEXTZL+2-IST)) ZGTF(JF,IST+IOFF+JJ-1) = 0.0_JPRBT - ! case JJ=0 - IF (G_NLOEN(IGLG)==1) ZGTF(JF,IST+IOFF-1) = 0.0_JPRBT + DO JF=1,KFIELDS + JMAX = G_NLOEN(IGLG) + SCAL = 1._JPRBT/REAL(JMAX,JPRBT) + IST = 2*(G_NMEN(IGLG)+1) + KGL=IGLG-OFFSET_VAR+1 + IOFF=D_NSTAGTF(KGL)+1 + + !$ACC LOOP SEQ + DO JJ=1, JMAX + ZGTF(JF,IOFF+JJ-1)= SCAL * ZGTF2(JF, IOFF+JJ-1) + ENDDO + + !! WHAT'S GOING ON HERE? TRUNCATING? + IF (JMAX== 1) ZGTF(JF,IST+IOFF-1) = 0.0_JPRBT + !$ACC LOOP SEQ + DO JJ=1,JMAX+R%NNOEXTZL+3-IST + ZGTF(JF,IST+IOFF+JJ-1) = 0.0_JPRBT ENDDO ENDDO ENDDO +!$acc end data !$ACC end data From 6f694a7c3149bddd40bcab223c901dcc402c1496 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 025/263] avoid some over-computation in ftdir --- .../external/fourier/create_plan_fftc.cu | 5 ++-- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 3 ++- src/trans/gpu/internal/ftdir_mod.F90 | 25 ++++++++----------- src/trans/gpu/internal/ftinv_mod.F90 | 3 ++- src/trans/gpu/internal/tpm_fftc.F90 | 15 ++++++----- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index b45e329de..b59fc2e00 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -61,11 +61,12 @@ static int planWorkspaceSize=100*1024*1024; //100MB extern "C" void -create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp) +create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp, int *stridep) { int ISIGN = *ISIGNp; int N = *Np; int LOT = *LOTp; +int stride = *stridep; cufftHandle plan; @@ -86,7 +87,6 @@ if (cudaDeviceSynchronize() != cudaSuccess){ // cufftSetAutoAllocation(plan, false); int embed[1]; -int stride; int dist; #ifdef TRANS_SINGLE @@ -98,7 +98,6 @@ cufftType cufft_2 = CUFFT_Z2D; #endif embed[0] = 1; -stride = LOT; dist = 1; cufftSafeCall(cufftCreate(&plan)); diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index dcfa4bd26..68d0ba759 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -217,7 +218,7 @@ END SUBROUTINE cudaProfilerStop IOFF=1 !ICHUNK=2*KF_FS+2 ICHUNK=ISIZE - CALL FTDIR(ICHUNK) + CALL FTDIR(ICHUNK,KF_FS) !ENDDO ENDIF diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index c28c7f81b..6b28bfd0e 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(KFIELDS) +SUBROUTINE FTDIR(STRIDE,KF_FS) !**** *FTDIR - Direct Fourier transform @@ -23,7 +23,8 @@ SUBROUTINE FTDIR(KFIELDS) ! CALL FTDIR(..) ! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELDS - number of fields +! -------------------- KSTIRDE - stride of PREEL +! KF_FS - number of fields ! Method. ! ------- @@ -62,20 +63,19 @@ SUBROUTINE FTDIR(KFIELDS) IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KF_FS INTEGER(KIND=JPIM) :: KGL -!!!REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(KFIELDS,D%NLENGTF) +!!!REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(STRIDE,D%NLENGTF) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 -INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE +INTEGER(KIND=JPIM) :: IGLG,IST,JJ,JF,IST1 +INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: JMAX REAL(KIND=JPRBT) :: SCAL -LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,ISCAL -INTEGER(KIND=JPIM) :: OFFSET_VAR, IUNIT, ISIZE, II, IMAX -integer :: istat, idev +INTEGER(KIND=JPIM) :: OFFSET_VAR +integer :: istat real(kind=jprbt), allocatable :: zgtf2(:,:) ! ------------------------------------------------------------------ @@ -102,11 +102,8 @@ SUBROUTINE FTDIR(KFIELDS) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - !ILEN = G_NLOEN(IGLG)+R_NNOEXTZL+3-IST - !IRLEN=G_NLOEN(IGLG)+R_NNOEXTZL - !ICLEN=(IRLEN/2+1)*2 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELDS) + CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),2*KF_FS,STRIDE) !$ACC host_data use_device(ZGTF,ZGTF2) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,ZGTF(1,IOFF),ZGTF2(1,IOFF)) !$ACC end host_data @@ -118,7 +115,7 @@ SUBROUTINE FTDIR(KFIELDS) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC parallel loop collapse(2) private(JMAX,JJ,KGL,IOFF,SCAL,IST) DEFAULT(NONE) DO IGLG=IBEG+OFFSET_VAR-1,IEND+OFFSET_VAR-1,IINC - DO JF=1,KFIELDS + DO JF=1,2*KF_FS JMAX = G_NLOEN(IGLG) SCAL = 1._JPRBT/REAL(JMAX,JPRBT) IST = 2*(G_NMEN(IGLG)+1) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 26ccbcdd1..e5fdbb4be 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -122,7 +123,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) !IF (G%NLOEN(IGLG)>1) THEN !call cudaProfilerStop() !istat=cuda_SetDevice(idev) - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELDS) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELDS,KFIELDS) !$ACC host_data use_device(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1, ioff),PREEL2(1, ioff)) !$ACC end host_data diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index 662b9477d..4586832fe 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2014- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -42,6 +43,7 @@ MODULE TPM_FFTC INTEGER(KIND=JPIM) :: NPLAN_ID=123456 INTEGER(KIND=JPIM) :: NPLAN INTEGER(KIND=JPIM) :: NLOT + INTEGER(KIND=JPIM) :: NSTRIDE INTEGER(KIND=JPIM) :: NTYPE TYPE(FFTC_PLAN),POINTER :: NEXT_PLAN => NULL() END TYPE FFTC_PLAN @@ -67,9 +69,9 @@ SUBROUTINE INIT_PLANS_FFT(KDLON) END SUBROUTINE INIT_PLANS_FFT -SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT) +SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT,KSTRIDE) INTEGER(KIND=JPIM),INTENT(OUT) :: KPLAN -INTEGER(KIND=JPIM),INTENT(IN) :: KTYPE,KN,KLOT +INTEGER(KIND=JPIM),INTENT(IN) :: KTYPE,KN,KLOT,KSTRIDE INTEGER(KIND=JPIM) :: IPLAN INTEGER(KIND=JPIM) :: IRANK, ISTRIDE @@ -79,10 +81,10 @@ SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT) LOGICAL :: LLRESTRICT_PLANS=.TRUE. TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN,START_FFTC_PLAN INTERFACE - SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT) BIND(C,NAME="create_plan_fftc_") + SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT,KSTRIDE) BIND(C,NAME="create_plan_fftc_") USE, INTRINSIC :: ISO_C_BINDING INTEGER(C_INT) :: KPLAN - INTEGER(C_INT) :: KTYPE,KN,KLOT + INTEGER(C_INT) :: KTYPE,KN,KLOT,KSTRIDE END SUBROUTINE CREATE_PLAN_FFTC END INTERFACE @@ -110,7 +112,7 @@ END SUBROUTINE CREATE_PLAN_FFTC ENDIF ! search for plan in existing plans DO JL=1,TC%N_PLANS(KN) - IF( KLOT == CURR_FFTC_PLAN%NLOT .AND. KTYPE == CURR_FFTC_PLAN%NTYPE )THEN + IF( KLOT == CURR_FFTC_PLAN%NLOT .AND. KTYPE == CURR_FFTC_PLAN%NTYPE .AND. KSTRIDE == CURR_FFTC_PLAN%NSTRIDE )THEN LLFOUND=.TRUE. IPLAN=CURR_FFTC_PLAN%NPLAN EXIT @@ -136,7 +138,7 @@ END SUBROUTINE CREATE_PLAN_FFTC ! WRITE(*,'("CREATE_PLAN_FFT: END: DESTROYING A PLAN AT THE START OF THE LIST")') ENDIF ENDIF - CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT) + CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT,KSTRIDE) KPLAN=IPLAN TC%N_PLANS(KN)=TC%N_PLANS(KN)+1 IF( TC%N_PLANS(KN) /= 1 )THEN @@ -149,6 +151,7 @@ END SUBROUTINE CREATE_PLAN_FFTC ENDIF CURR_FFTC_PLAN%NPLAN=IPLAN CURR_FFTC_PLAN%NLOT=KLOT + CURR_FFTC_PLAN%NSTRIDE=KSTRIDE CURR_FFTC_PLAN%NTYPE=KTYPE CURR_FFTC_PLAN%NEXT_PLAN=>NULL() ! write(*,'("CREATE_PLAN_FFT: KN=",I5," NPLANS=",I3," KLOT=",I6," KTYPE=",I2,& From c9bdc9e001c9d5ee9b4a9fa75ec0e69d462d26f1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 026/263] Restructure fourier_in - little slow down but better readibility --- src/trans/gpu/internal/fourier_out_mod.F90 | 75 ++++++---------------- 1 file changed, 21 insertions(+), 54 deletions(-) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 1617406f8..dd44ec1ef 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -39,9 +40,9 @@ SUBROUTINE FOURIER_OUT(KFIELDS) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NPTRLS,D_NSTAGTF,D_MSTABF,D_NSTAGT1B,D_NPNTGTB0,D_NPROCM, D_NPROCL +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NPTRLS,D_NSTAGTF,D_MSTABF,D_NSTAGT1B,D_NPNTGTB0,D_NPROCM USE TPM_TRANS ,ONLY : FOUBUF_IN, ZGTF -USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NMEN_MAX +USE TPM_GEOMETRY ,ONLY : G_NMEN ! IMPLICIT NONE @@ -50,9 +51,9 @@ SUBROUTINE FOURIER_OUT(KFIELDS) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS INTEGER(KIND=JPIM) :: KGL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA, ISTA1,JMMAX, iunit +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,JMMAX -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF,iimax1,iimax2,iimax3 +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF ! ------------------------------------------------------------------ @@ -66,61 +67,27 @@ SUBROUTINE FOURIER_OUT(KFIELDS) IINC=-1 ENDIF -!$ACC DATA PRESENT(FOUBUF_IN,ZGTF, D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT1B,D_MSTABF,D_NPNTGTB0,D_NSTAGTF) COPYIN(IBEG,IEND,IINC) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(IGLG,JMMAX,IPROC,ISTA,IOFF) +!$ACC DATA PRESENT(FOUBUF_IN,ZGTF, D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT1B,D_MSTABF,D_NPNTGTB0,D_NSTAGTF) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,JMMAX,IPROC,ISTA,IOFF,JM) DO KGL=IBEG,IEND,IINC - DO JM=0,G_NMEN_MAX - DO JF=1,KFIELDS - - IGLG = D_NPTRLS(MYSETW)+KGL-1 - JMMAX = G_NMEN(IGLG) - if (JM .le. JMMAX) then - - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS - IOFF = 1+D_NSTAGTF(KGL) - - ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 - FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = ZGTF(2*JF , 2*JM+IOFF) - !if( myproc.eq.1 .and. jf.eq.1 .and. JM.eq.0 ) write(*,*) 'fou_oD ',ISTA+2*JF-1,FOUBUF_IN(ISTA+2*JF-1),FOUBUF_IN(ISTA+2*JF ) - !if( myproc.eq.1 .and. jf.eq.1 .and. JM.eq.0 ) write(*,*) 'fou_oD1 ', ZGTF(JF, 2*JM+IOFF), ZGTF(JF, 2*JM+1+IOFF) - !if( myproc.eq.2 .and. jf.eq.1 .and. JM.eq.0 ) write(*,*) 'fou_oD ',ISTA+2*JF-1,FOUBUF_IN(ISTA+2*JF-1),FOUBUF_IN(ISTA+2*JF ) - !if( myproc.eq.2 .and. jf.eq.1 .and. JM.eq.0 ) write(*,*) 'fou_oD1 ', IOFF, KGL, ZGTF(JF, 2*JM+IOFF), ZGTF(JF, 2*JM+1+IOFF) - - end if + DO JF=1,KFIELDS + + IGLG = D_NPTRLS(MYSETW)+KGL-1 + IOFF = 1+D_NSTAGTF(KGL) + + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS + + ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 + FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = ZGTF(2*JF , 2*JM+IOFF) ENDDO - ENDDO + ENDDO END DO !$ACC END DATA -!iimax1=0 -!iimax2=0 -!iimax3=0 -!iunit=myproc+300 -!DO KGL=IBEG,IEND,IINC -! DO JM=0,G_NMEN_MAX -! DO JF=1,KFIELDS -! IGLG = D_NPTRLS(MYSETW)+KGL-1 -! JMMAX = G_NMEN(IGLG) -! if (JM .le. JMMAX) then -! IPROC = D_NPROCM(JM) -! ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS -! IOFF = 1+D_NSTAGTF(KGL) -! iimax1 = max(iimax1,2*JF) -! iimax2 = max(iimax2,2*JM+IOFF) -! iimax3 = max(iimax3,ISTA+2*JF) -! !if( jf.eq.(41+137-1) .and. JM.eq.0 ) write(iunit,*) 'fou_o ',ISTA+2*JF-1,FOUBUF_IN(ISTA+2*JF-1),FOUBUF_IN(ISTA+2*JF ) -! if( jf.eq.1 .and. JM.eq.0 ) write(iunit,*) 'fou_o10 ', IOFF, KGL, ZGTF(2*JF-1, 2*JM+IOFF), ZGTF(2*JF, 2*JM+IOFF) -! !if( jf.eq.1 .and. JM.eq.1 ) write(iunit,*) 'fou_o11 ', IOFF, KGL, ZGTF(JF, 2*JM+IOFF), ZGTF(JF, 2*JM+1+IOFF) -! !if( jf.eq.1 .and. JM.eq.2 ) write(iunit,*) 'fou_o12 ', IOFF, KGL, ZGTF(JF, 2*JM+IOFF), ZGTF(JF, 2*JM+1+IOFF) -! !if( jf.eq.1 ) write(iunit,*) 'fou_o2 ', IOFF, ZGTF(JF, 2*JM-1+IOFF),ZGTF(JF, 2*JM+IOFF),ZGTF(JF, 2*JM+1+IOFF) - ! end if -! -! ENDDO -! ENDDO -!ENDDO -!write(iunit,*), 'maxes ',iimax1,size(ZGTF,1),iimax2,size(ZGTF,2),iimax3,size(FOUBUF_IN) ! ------------------------------------------------------------------ From 784e6230027d769a33638a9daab47b281f58b363 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 027/263] Cleanup trgtol writes into ZGTF --- src/trans/gpu/internal/trgtol_mod.F90 | 29 +++++++++------------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 160fad636..7e2aced3e 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -105,14 +105,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) - INTEGER(KIND=JPIM) :: ICOMBUFS_FLD(NPROC),ICOMBUFR_FLD(NPROC) REAL(KIND=JPRBT) :: ZDUM(2) INTEGER(KIND=JPIM) :: ISENT (NPROC) INTEGER(KIND=JPIM) :: IRCVD (NPROC) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*4) + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: JRECV (NPROC) INTEGER(KIND=JPIM) :: JSEND (NPROC) @@ -133,15 +132,14 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 - INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(MAX(KF_GP,KF_FS)) + INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(KF_FS) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT - INTEGER(KIND=JPIM) :: IRECV_FLD_CNT INTEGER(KIND=JPIM) :: INUMFLDS INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT + INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL INTEGER(KIND=JPIM) :: IJPOS(NGPBLKS),IFLDA(KF_GP),JJ,JI,IFLDT REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -440,6 +438,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) IF(ISENDTOT(MYPROC) > 0 )THEN + ! Input is KF_GP fields. We find the resulting KF_FS fields. IFLDS = 0 DO JFLD=1,KF_GP IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN @@ -470,7 +469,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF(LLPGPONLY) THEN !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) DO JBLK=1,NGPBLKS - DO JFLD=1,IFLDS + DO JFLD=1,KF_FS DO JKL=1, JK_MAX IFIRST = IGPTRSEND(1,JBLK,MYSETW) ILAST = IGPTRSEND(2,JBLK,MYSETW) @@ -486,7 +485,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ELSE !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) DO JBLK=1,NGPBLKS - DO JFLD=1,IFLDS + DO JFLD=1,KF_FS DO JKL=1, JK_MAX IFIRST = IGPTRSEND(1,JBLK,MYSETW) ILAST = IGPTRSEND(2,JBLK,MYSETW) @@ -574,8 +573,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ENDDO - - ICOMBUFS_FLD(INS) = IFLD ENDDO @@ -617,9 +614,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IRECV=JRECV(INR) CALL MPI_IRECV(ZCOMBUFR(1:IRECVTOT(IRECV),INR),IRECVTOT(IRECV), & & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) - IR=IR+1 - CALL MPI_IRECV(ICOMBUFR_FLD(INR),1, & - & MPI_INTEGER,NPRCIDS(IRECV)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !....Send loop......................................................... @@ -628,9 +622,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ISEND=JSEND(INS) CALL MPI_ISEND(ZCOMBUFS(1:ISENDTOT(ISEND),INS),ISENDTOT(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) - IR=IR+1 - CALL MPI_ISEND(ICOMBUFS_FLD(INS),1, & - & MPI_INTEGER,NPRCIDS(ISEND)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA @@ -665,11 +656,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INR=1,INRECV IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS - IRECV_FLD_CNT = ICOMBUFR_FLD(INR) - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) COPYIN(IRECV,ILEN,IRECV_FLD_CNT) - DO JFLD=1,IRECV_FLD_CNT + INDOFFL = INDOFF(IRECV) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) + DO JFLD=1,KF_FS DO JL=1,ILEN - II = KINDEX(INDOFF(IRECV)+JL) + II = KINDEX(INDOFFL+JL) PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-1)*ILEN,INR) ENDDO ENDDO From 66731edeff67ccc1aab2578b2b046788eb0a6bbb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 028/263] Simplify truncation --- src/trans/gpu/internal/ftdir_mod.F90 | 42 +++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 6b28bfd0e..7ee166541 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -67,15 +67,14 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) INTEGER(KIND=JPIM) :: KGL !!!REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(STRIDE,D%NLENGTF) -INTEGER(KIND=JPIM) :: IGLG,IST,JJ,JF,IST1 +INTEGER(KIND=JPIM) :: IGLG,JJ,JF,IST1 INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: JMAX REAL(KIND=JPRBT) :: SCAL -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,ISCAL +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,JTRUNC_START INTEGER(KIND=JPIM) :: OFFSET_VAR -integer :: istat real(kind=jprbt), allocatable :: zgtf2(:,:) ! ------------------------------------------------------------------ @@ -97,7 +96,6 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) !$ACC DATA CREATE(ZGTF2) -!!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(KGL,IOFF,IGLG,IPLAN_R2C,istat) DO KGL=IBEG,IEND,IINC IOFF=D%NSTAGTF(KGL)+1 @@ -108,31 +106,31 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,ZGTF(1,IOFF),ZGTF2(1,IOFF)) !$ACC end host_data END DO -!!$OMP END PARALLEL DO - -istat = cuda_Synchronize() OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC parallel loop collapse(2) private(JMAX,JJ,KGL,IOFF,SCAL,IST) DEFAULT(NONE) -DO IGLG=IBEG+OFFSET_VAR-1,IEND+OFFSET_VAR-1,IINC +!$ACC parallel loop collapse(2) private(JMAX,JJ,IOFF,SCAL,JTRUNC_START) DEFAULT(NONE) +DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_FS - JMAX = G_NLOEN(IGLG) - SCAL = 1._JPRBT/REAL(JMAX,JPRBT) - IST = 2*(G_NMEN(IGLG)+1) - KGL=IGLG-OFFSET_VAR+1 + IGLG = OFFSET_VAR+KGL-1 IOFF=D_NSTAGTF(KGL)+1 + JMAX = G_NLOEN(IGLG) + ! Multiply with two because we are in complex domain + ! TODO I am not sure if this is +1,0,-1 + JTRUNC_START = MIN(2*(G_NMEN(IGLG)+1)-1,JMAX) + + SCAL = 1._JPRBT/REAL(JMAX,JPRBT) !$ACC LOOP SEQ - DO JJ=1, JMAX + DO JJ=1, JTRUNC_START, 2 ZGTF(JF,IOFF+JJ-1)= SCAL * ZGTF2(JF, IOFF+JJ-1) - ENDDO - - !! WHAT'S GOING ON HERE? TRUNCATING? - IF (JMAX== 1) ZGTF(JF,IST+IOFF-1) = 0.0_JPRBT - !$ACC LOOP SEQ - DO JJ=1,JMAX+R%NNOEXTZL+3-IST - ZGTF(JF,IST+IOFF+JJ-1) = 0.0_JPRBT - ENDDO + ENDDO + + ! In fact this is not needed, probably, because FOURIER_OUT could only pack + ! what it needs + !$ACC LOOP SEQ + DO JJ=JTRUNC_START + 1, JMAX, 2 + ZGTF(JF,IOFF+JJ-1)= 0.0_JPRBT + ENDDO ENDDO ENDDO !$acc end data From 5de8ed5234b0fba619722e87d16f471022d78779 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 029/263] Remove redundant temporaries for ledir --- src/trans/gpu/internal/ldfou2_mod.F90 | 112 ---------------------- src/trans/gpu/internal/ledir_mod.F90 | 103 ++++++++++---------- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 4 +- src/trans/gpu/internal/ltdir_mod.F90 | 21 +---- src/trans/gpu/internal/prfi2_mod.F90 | 99 ------------------- src/trans/gpu/internal/prfi2b_mod.F90 | 115 ----------------------- 6 files changed, 56 insertions(+), 398 deletions(-) delete mode 100755 src/trans/gpu/internal/ldfou2_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi2_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi2b_mod.F90 diff --git a/src/trans/gpu/internal/ldfou2_mod.F90 b/src/trans/gpu/internal/ldfou2_mod.F90 deleted file mode 100755 index d314f5820..000000000 --- a/src/trans/gpu/internal/ldfou2_mod.F90 +++ /dev/null @@ -1,112 +0,0 @@ -! (C) Copyright 1991- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LDFOU2_MOD -CONTAINS -SUBROUTINE LDFOU2(KF_UV,PAIA) - -!**** *LDFOU2* - Division by a*cos(theta) of u and v - -! Purpose. -! -------- -! In Fourier space divide u and v by a*cos(theta). - -!** Interface. -! ---------- -! CALL LDFOU2(KM,PAIA,PSIA) - -! Explicit arguments : -! -------------------- KM - zonal wavenumber -! PAIA - antisymmetric fourier fields -! PSIA - symmetric fourierfields - -! Implicit arguments : RACTHE - 1./(a*cos(theta)) -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 91-07-01 -! Modified : 94-04-06 R. El Khatib - Full-POS configuration 'P' -! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div -! instead of u,v->vor,div -! MPP Group: 95-10-01 Message Passing option added -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_FIELDS ,ONLY : F -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS -USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL -USE TPM_GEOMETRY ,ONLY : G, G_NDGLU - -! - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM) :: KM,KMLOC - -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PAIA(:,:,:) -!REAL(KIND=JPRBT) ,INTENT(INOUT) :: PSIA(:,:,:), PAIA(:,:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: J, JGL ,ISL - -! ------------------------------------------------------------------ - -!* 1. DIVIDE U V BY A*COS(THETA) -! -------------------------- - -IF( KF_UV > 0 ) THEN - -!$ACC DATA & -!$ACC& PRESENT(F,F%RACTHE,D,D_NUMP,D_MYMS,R_NDGNH,R_NDGL,G_NDGLU) & -!$ACC& PRESENT(PAIA) - -!loop over wavenumber - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO J=1,4*KF_UV ! (real+complex) * (U+V) - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH -!* 1.1 U AND V - PAIA(J,JGL,KMLOC) = PAIA(J,JGL,KMLOC)*F%RACTHE(JGL) - ENDDO - ENDDO -ENDDO -!$ACC END DATA - -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE LDFOU2 -END MODULE LDFOU2_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e9349a0c4..7ecf7cda0 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -10,7 +10,7 @@ MODULE LEDIR_MOD CONTAINS -SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) +SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !**** *LEDIR* - Direct Legendre transform. @@ -24,7 +24,6 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! Explicit arguments : KM - zonal wavenumber ! -------------------- KFC - number of field to transform -! PAIA - antisymmetric part of Fourier ! fields for zonal wavenumber KM ! PSIA - symmetric part of Fourier ! fields for zonal wavenumber KM @@ -56,7 +55,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPIB ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX +USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F, & & ZAA,DZBST,DZCAT,ZAS,DZCST,& @@ -65,58 +64,58 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) USE TPM_DISTR USE TPM_GEN, ONLY: NOUT USE TPM_FLT +USE TPM_TRANS ,ONLY : FOUBUF USE BUTTERFLY_ALG_MOD USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC + IMPLICIT NONE ! DUMMY ARGUMENTS -INTEGER(KIND=JPIM) :: KM -INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM) :: KDGLU -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM), INTENT(IN) :: KLED2 -INTEGER(KIND=JPIM), INTENT(IN) :: KMODE - -REAL(KIND=JPRBT), INTENT(IN) :: PAIA(:,:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) ! LOCAL VARIABLES +INTEGER(KIND=JPIM) :: KM +INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +REAL(KIND=JPRBT) :: PAIA -INTEGER :: ISTAT +INTEGER(KIND=JPIM) :: IGLS, JF, JGL +INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) !$ACC DATA & !$ACC& PRESENT(F,F%RW) & -!$ACC& PRESENT(D,D_NUMP,D_MYMS,R,R_NDGNH,G,G_NDGLU,R_NSMAX,R_NTMAX) & -!$ACC& PRESENT(PAIA) & +!$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,DZBST,DZCST,DZCAT) & -!$ACC& PRESENT(POA1,dzbst0,dzcat0,dzbst0,dzcst0) !& - +!$ACC& PRESENT(POA1,DZBST0,DZCAT0,DZBST0,DZCST0) & +!$ACC& PRESENT(FOUBUF,D_NPNTGTB1,D_NSTAGT1B,D_NPROCL) ! anti-symmetric -IF ( KMODE == -1 ) THEN - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) DO KMLOC=1,D_NUMP - DO JK=1,2*KF_FS + DO JF=1,KF_FS*2 KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - ISL = R_NDGNH-G_NDGLU(KM)+1 - - !$ACC LOOP SEQ - DO J=1,G_NDGLU(KM) - DZBST((JK-1)+1+(J-1+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - ENDDO - ENDIF + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + IF (JF .LE. 4*KF_UV) THEN + PAIA = PAIA*F%RACTHE(JGL) + ENDIF + DZBST((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ENDDO ENDDO END DO @@ -154,11 +153,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) IF(KMLOC0 > 0) THEN print*,'computing m=0 in double precision' - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) - DO J=1,G_NDGLU(0) - DO JK=1,2*KF_FS,2 - ISL = R_NDGNH-G_NDGLU(0)+1 - DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JF=1,2*KF_FS,2 + DZBST0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + & = DZBST((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -188,22 +187,24 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDDO ENDIF -ELSE - ! symmetric -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) DO KMLOC=1,D_NUMP - DO JK=1,2*KF_FS + DO JF=1,KF_FS*2 KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - ISL = R_NDGNH-G_NDGLU(KM)+1 - - !$ACC LOOP SEQ - DO J=1,G_NDGLU(KM) - DZBST(JK+(J-1+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC)*F%RW(ISL+J-1) - ENDDO - ENDIF + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF .LE. 4*KF_UV) THEN + PAIA = PAIA*F%RACTHE(JGL) + ENDIF + DZBST((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ENDDO ENDDO END DO @@ -237,11 +238,11 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDDO IF(KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(ISL) DEFAULT(NONE) - DO J=1,G_NDGLU(0) - DO JK=1,2*KF_FS,2 - ISL = R_NDGNH-G_NDGLU(0)+1 - DZBST0((JK-1)/2+1+(J-1)*2*KF_FS)=PAIA(JK,ISL+J-1,KMLOC0)*F%RW(ISL+J-1) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JF=1,2*KF_FS,2 + DZBST0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + & = DZBST((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -271,8 +272,6 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ENDIF -ENDIF - !$ACC END DATA diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 367aba00f..d917e8f88 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -64,14 +65,13 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - INTEGER(KIND=JPIM) :: JM,IM,IBLEN,ILED2 + INTEGER(KIND=JPIM) :: JM,IM,ILED2 !$ACC DATA PRESENT(FOUBUF_IN) CREATE(FOUBUF) ! Transposition from Fourier space distribution to spectral space distribution ! requires currently both on the host !!! - IBLEN = D%NLENGT0B*2*KF_FS CALL GSTATS(153,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 9cae24ddd..d6fa3bdea 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1987- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -23,8 +24,6 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& USE TPM_GEOMETRY USE PREPSNM_MOD ,ONLY : PREPSNM - USE PRFI2B_MOD ,ONLY : PRFI2B - USE LDFOU2_MOD ,ONLY : LDFOU2 USE LEDIR_MOD ,ONLY : LEDIR USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP @@ -56,7 +55,6 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& ! ---------- ! PREPSNM - prepare REPSNM for wavenumber KM ! PRFI2 - prepares the Fourier work arrays for model variables. - ! LDFOU2 - computations in Fourier space ! LEDIR - direct Legendre transform ! UVTVD - ! UPDSP - updating of spectral arrays (fields) @@ -141,21 +139,8 @@ END SUBROUTINE cudaProfilerStop !* 2. PREPARE WORK ARRAYS. ! -------------------- - ! serial to save memory, Nils - - ! anti-symmetric - - - CALL PRFI2B(KF_FS,ZAIA,-1) - CALL LDFOU2(KF_UV,ZAIA) - CALL LEDIR(KF_FS,KLED2,ZAIA,ZOA1,-1) - - - ! symmetric - - CALL PRFI2B(KF_FS,ZAIA,1) - CALL LDFOU2(KF_UV,ZAIA) - CALL LEDIR(KF_FS,KLED2,ZAIA,ZOA1,1) + ! do the legendre transform + CALL LEDIR(KF_FS,KF_UV,ZOA1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/prfi2_mod.F90 b/src/trans/gpu/internal/prfi2_mod.F90 deleted file mode 100755 index 913d3022e..000000000 --- a/src/trans/gpu/internal/prfi2_mod.F90 +++ /dev/null @@ -1,99 +0,0 @@ -! (C) Copyright 1987- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI2_MOD - CONTAINS - SUBROUTINE PRFI2(KF_FS,PAIA,PSIA) - - !**** *PRFI2* - Prepare input work arrays for direct transform - - ! Purpose. - ! -------- - ! To extract the Fourier fields for a specific zonal wavenumber - ! and put them in an order suitable for the direct Legendre - ! tranforms, i.e. split into symmetric and anti-symmetric part. - - !** Interface. - ! ---------- - ! *CALL* *PRFI2(..) - - ! Explicit arguments : - ! -------------------- KM - zonal wavenumber - ! KMLOC - local zonal wavenumber - ! PAIA - antisymmetric part of Fourier - ! components for KM (output) - ! PSIA - symmetric part of Fourier - ! components for KM (output) - - ! Implicit arguments : The Grid point arrays of the model. - ! -------------------- - - ! Method. - ! ------- - - ! Externals. PRFI2B - basic copying routine - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! Mats Hamrud and Philippe Courtier *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 87-11-25 - ! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite - ! for uv formulation - ! Modified : 93-03-19 D. Giard - CDCONF='T' - ! Modified : 93-??-?? ???????? - CDCONF='C'--> bug if CDCONF='T' - ! Modified : 93-05-13 D. Giard - correction of the previous bug - ! Modified : 93-11-18 M. Hamrud - use only one Fourier buffer - ! Modified : 94-04-06 R. El Khatib - Full-POS configuration 'P' - ! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div - ! instead of u,v->vor,div - ! MPP Group: 95-10-01 Support for Distributed Memory version - ! Modified : 04/06/99 D.Salmond : change order of AIA and SIA - ! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - - !USE TPM_TRANS - - USE PRFI2B_MOD ,ONLY : PRFI2B - ! - - IMPLICIT NONE - - - ! DUMMY INTEGER SCALARS - INTEGER(KIND=JPIM) :: KM - INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM) , INTENT(IN) :: KF_FS - - - REAL(KIND=JPRBT) , INTENT(OUT) :: PSIA(:,:,:), PAIA(:,:,:) - - - ! LOCAL INTEGER SCALARS - - - ! ------------------------------------------------------------------ - - !* 2. EXTRACT SYM./ANTISYM. FIELDS FROM TIME T+1. - ! ------------------------------------------- - -! CALL PRFI2B(KF_FS,PAIA,PSIA) - - ! ------------------------------------------------------------------ - - END SUBROUTINE PRFI2 - END MODULE PRFI2_MOD diff --git a/src/trans/gpu/internal/prfi2b_mod.F90 b/src/trans/gpu/internal/prfi2b_mod.F90 deleted file mode 100755 index ea9f4bc77..000000000 --- a/src/trans/gpu/internal/prfi2b_mod.F90 +++ /dev/null @@ -1,115 +0,0 @@ -! (C) Copyright 1990- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI2B_MOD - CONTAINS - SUBROUTINE PRFI2B(KF_FS,PAIA,KMODE) - - !**** *PRFI2B* - Prepare input work arrays for direct transform - - ! Purpose. - ! -------- - ! To extract the Fourier fields for a specific zonal wavenumber - ! and put them in an order suitable for the direct Legendre - ! tranforms, i.e. split into symmetric and anti-symmetric part. - - !** Interface. - ! ---------- - ! *CALL* *PRFI2B(..) - - ! Explicit arguments : - ! ------------------- KF_FS - number of fields - ! KM - zonal wavenumber - ! KMLOC - local zonal wavenumber - ! PAOA - antisymmetric part of Fourier - ! fields for zonal wavenumber KM - ! PSOA - symmetric part of Fourier - ! fields for zonal wavenumber KM - - ! Implicit arguments : FOUBUF in TPM_TRANS - ! -------------------- - - ! Method. - ! ------- - - ! Externals. None. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! Mats Hamrud and Philippe Courtier *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 90-07-01 - ! MPP Group: 95-10-01 Support for Distributed Memory version - ! Modified : 04/06/99 D.Salmond : change order of AIA and SIA - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - - USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL - USE TPM_TRANS ,ONLY : FOUBUF - USE TPM_GEOMETRY ,ONLY : G, G_NDGLU - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1,MYPROC - ! - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS - INTEGER(KIND=JPIM),INTENT(IN) :: KMODE - INTEGER(KIND=JPIM) :: KM,KMLOC - REAL(KIND=JPRBT) , INTENT(OUT) :: PAIA(:,:,:) -!! REAL(KIND=JPRBT) , INTENT(OUT) :: PSIA(:,:,:), PAIA(:,:,:) - - - ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: IGLS, ISL, JF, JGL, iunit - - INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 - - - ! ------------------------------------------------------------------ - - !* 1. EXTRACT SYM./ANTISYM. FIELDS FROM FOURIER ARRAY. - ! ------------------------------------------------ - - -!$ACC DATA PRESENT(PAIA,FOUBUF, D_NPNTGTB1,D_NSTAGT1B,D_MYMS,R_NDGL,R_NDGNH,G_NDGLU,D_NPROCL) - -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL) -DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS - IF( KMODE == -1 ) THEN - PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - ELSE - PAIA(JF,JGL,KMLOC) = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - ENDIF - ENDDO - ENDDO -END DO - -!$ACC END DATA - - ! ------------------------------------------------------------------ - - END SUBROUTINE PRFI2B - END MODULE PRFI2B_MOD From 4676c9c7ee5807b9b72cd926d9db451040fc5d5c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 030/263] Move the OpenACC Updates for trltom to where they belong to --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 3 --- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 1 - src/trans/gpu/internal/trltom_mod.F90 | 33 +++--------------------- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 68d0ba759..e81334336 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -225,9 +225,6 @@ END SUBROUTINE cudaProfilerStop ! Save Fourier data in FOUBUF_IN CALL FOURIER_OUT(KF_FS) -#ifndef USE_CUDA_AWARE_MPI_FT - !$ACC UPDATE HOST(FOUBUF_IN) -#endif CALL GSTATS(1640,1) !DEALLOCATE(ZGTF) diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index d917e8f88..56bcb5fb9 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -78,7 +78,6 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,2*KF_FS) #else CALL TRLTOM(FOUBUF_IN,FOUBUF,2*KF_FS) - !$ACC UPDATE DEVICE(FOUBUF) #endif CALL GSTATS(153,1) diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 6da1718a3..728cb85f2 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1995- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -79,21 +80,6 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE - - INTERFACE - - FUNCTION ALLTOALLV_CUDAIPC(input,len,soff,output,roff,mtol_or_ltom) BIND(C,name='Alltoallv_CUDAIPC') - USE, INTRINSIC :: ISO_C_BINDING - IMPLICIT NONE - real(c_double), dimension(*) :: input,output - integer(c_int), dimension(*) :: len,soff,roff - integer(c_int),value :: mtol_or_ltom - integer(c_int) :: ALLTOALLV_CUDAIPC - END FUNCTION ALLTOALLV_CUDAIPC - - END INTERFACE - - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) @@ -258,21 +244,6 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE - - INTERFACE - - FUNCTION ALLTOALLV_CUDAIPC(input,len,soff,output,roff,mtol_or_ltom) BIND(C,name='Alltoallv_CUDAIPC') - USE, INTRINSIC :: ISO_C_BINDING - IMPLICIT NONE - real(c_double), dimension(*) :: input,output - integer(c_int), dimension(*) :: len,soff,roff - integer(c_int),value :: mtol_or_ltom - integer(c_int) :: ALLTOALLV_CUDAIPC - END FUNCTION ALLTOALLV_CUDAIPC - - END INTERFACE - - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) @@ -307,12 +278,14 @@ END FUNCTION ALLTOALLV_CUDAIPC IF(NPROC > 1) THEN CALL GSTATS(806,0) + !$ACC UPDATE HOST(PFBUF_IN) CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRLTOM:') call MPI_BARRIER(MPI_COMM_WORLD,IERROR) + !$ACC UPDATE DEVICE(PFBUF) CALL GSTATS(806,1) ELSE ILEN = D%NLTSGTB(MYSETW)*KFIELD From a575ed21080f4c5ac54d5f657b624471d15a61e2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:52 -0700 Subject: [PATCH 031/263] Various small improvements / renamings --- src/trans/gpu/internal/fourier_out_mod.F90 | 28 ++++++----------- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 15 --------- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 15 +++++---- src/trans/gpu/internal/trltom_mod.F90 | 36 +++++++++++----------- 4 files changed, 34 insertions(+), 60 deletions(-) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index dd44ec1ef..e6eb0ffd7 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_OUT_MOD CONTAINS -SUBROUTINE FOURIER_OUT(KFIELDS) +SUBROUTINE FOURIER_OUT(KF_FS) !**** *FOURIER_OUT* - Copy fourier data from local array to buffer @@ -23,7 +23,7 @@ SUBROUTINE FOURIER_OUT(KFIELDS) ! CALL FOURIER_OUT(...) ! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KFIELDS - number of fields +! -------------------- KF_FS - number of fields ! ! Externals. None. ! ---------- @@ -48,29 +48,19 @@ SUBROUTINE FOURIER_OUT(KFIELDS) IMPLICIT NONE !REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS INTEGER(KIND=JPIM) :: KGL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,JMMAX +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF ! ------------------------------------------------------------------ -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - -!$ACC DATA PRESENT(FOUBUF_IN,ZGTF, D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT1B,D_MSTABF,D_NPNTGTB0,D_NSTAGTF) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,JMMAX,IPROC,ISTA,IOFF,JM) -DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS +!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT1B,D_MSTABF,D_NPNTGTB0,D_NSTAGTF) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,IPROC,ISTA,IOFF,JM) +DO KGL=1,D%NDGL_FS + DO JF=1,KF_FS IGLG = D_NPTRLS(MYSETW)+KGL-1 IOFF = 1+D_NSTAGTF(KGL) @@ -78,7 +68,7 @@ SUBROUTINE FOURIER_OUT(KFIELDS) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS + ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KF_FS ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index e81334336..fabb17bce 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -64,7 +64,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE -USE FOURIER_OUT_MOD ,ONLY : FOURIER_OUT USE FTDIR_MOD ,ONLY : FTDIR use ieee_arithmetic ! @@ -192,16 +191,6 @@ END SUBROUTINE cudaProfilerStop ! Fourier transform -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - !write(301,*) 'sizey: ', myproc, size(zgtf,1), KF_FS CALL GSTATS(1640,0) @@ -222,10 +211,6 @@ END SUBROUTINE cudaProfilerStop !ENDDO ENDIF -! Save Fourier data in FOUBUF_IN - - CALL FOURIER_OUT(KF_FS) - CALL GSTATS(1640,1) !DEALLOCATE(ZGTF) CALL GSTATS(106,1) diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 56bcb5fb9..e843b3c2a 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -46,6 +46,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F + USE FOURIER_OUT_MOD ,ONLY : FOURIER_OUT USE LTDIR_MOD ,ONLY : LTDIR @@ -69,35 +70,33 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & !$ACC DATA PRESENT(FOUBUF_IN) CREATE(FOUBUF) - ! Transposition from Fourier space distribution to spectral space distribution - ! requires currently both on the host !!! + CALL FOURIER_OUT(KF_FS) CALL GSTATS(153,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,2*KF_FS) + CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) #else - CALL TRLTOM(FOUBUF_IN,FOUBUF,2*KF_FS) + CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) #endif CALL GSTATS(153,1) ! Direct Legendre transform CALL GSTATS(103,0) - ILED2 = 2*KF_FS CALL GSTATS(1645,0) - IF(KF_FS>0) THEN + IF (KF_FS > 0) THEN CALL LTDIR(KF_FS,KF_UV,KF_SCALARS,ILED2, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) - ENDIF - !$ACC END DATA CALL GSTATS(1645,1) CALL GSTATS(103,1) + + !$ACC END DATA ! ----------------------------------------------------------------- diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 728cb85f2..32066a80d 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -10,7 +10,7 @@ MODULE TRLTOM_MOD CONTAINS - SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) + SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) !**** *TRLTOM * - transposition in Fourierspace @@ -29,7 +29,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. - ! KFIELD - Number of fields communicated + ! KF_FS - Number of fields communicated ! Implicit arguments : ! -------------------- @@ -80,7 +80,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) @@ -114,10 +114,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ITAG = MTAGLM DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*KFIELD - ILENR(J) = D%NLTSFTB(J)*KFIELD - IOFFR(J) = D%NSTAGT1B(J)*KFIELD + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS ENDDO IF(NPROC > 1) THEN @@ -160,8 +160,8 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) CALL GSTATS(806,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT1B(MYSETW)*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 CALL GSTATS(1607,0) !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 @@ -174,7 +174,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ! ------------------------------------------------------------------ END SUBROUTINE TRLTOM_CUDAAWARE - SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) + SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) !**** *TRLTOM * - transposition in Fourierspace @@ -193,7 +193,7 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. - ! KFIELD - Number of fields communicated + ! KF_FS - Number of fields communicated ! Implicit arguments : ! -------------------- @@ -244,7 +244,7 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) @@ -270,10 +270,10 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) ITAG = MTAGLM DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*KFIELD - ILENR(J) = D%NLTSFTB(J)*KFIELD - IOFFR(J) = D%NSTAGT1B(J)*KFIELD + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS ENDDO IF(NPROC > 1) THEN @@ -288,8 +288,8 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KFIELD) !$ACC UPDATE DEVICE(PFBUF) CALL GSTATS(806,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT1B(MYSETW)*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 CALL GSTATS(1607,0) DO J=ISTA,ISTA+ILEN-1 PFBUF(J) = PFBUF_IN(J) From cb109d56de41c4b773748ecd63b5b32494897786 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:53 -0700 Subject: [PATCH 032/263] tight packing in trmtol/trltom for FOUBUF/FOUBUF_IN --- src/trans/gpu/internal/asre1b_mod.F90 | 9 +++---- src/trans/gpu/internal/fourier_in_mod.F90 | 3 ++- src/trans/gpu/internal/fourier_out_mod.F90 | 6 ++--- src/trans/gpu/internal/fspgl_int_mod.F90 | 5 ++-- src/trans/gpu/internal/sump_trans_mod.F90 | 9 ++++--- src/trans/gpu/internal/trltom_mod.F90 | 28 +++++++++++----------- src/trans/gpu/internal/trmtol_mod.F90 | 9 +++---- 7 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/trans/gpu/internal/asre1b_mod.F90 b/src/trans/gpu/internal/asre1b_mod.F90 index cb31156aa..585036e6e 100755 --- a/src/trans/gpu/internal/asre1b_mod.F90 +++ b/src/trans/gpu/internal/asre1b_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -16,7 +17,7 @@ SUBROUTINE ASRE1B(KFIELD,PAOA,PSOA) USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL USE TPM_TRANS ,ONLY : FOUBUF_IN USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1 +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1 use tpm_gen, only: nout !**** *ASRE1B* - Recombine antisymmetric and symmetric parts @@ -81,7 +82,7 @@ SUBROUTINE ASRE1B(KFIELD,PAOA,PSOA) !* 1. RECOMBINATION OF SYMMETRIC AND ANTSYMMETRIC PARTS. ! --------------------------------------------------- -!$ACC DATA PRESENT(PAOA,PSOA,D_MYMS,D_NPROCL,D_NSTAGT0B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) +!$ACC DATA PRESENT(PAOA,PSOA,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IPROC,ISTAN,IGLS,IPROCS,ISTAS) DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) @@ -89,10 +90,10 @@ SUBROUTINE ASRE1B(KFIELD,PAOA,PSOA) DO JGL=ISL, R_NDGNH ! if (JGL .ge. ISL) then IPROC = D_NPROCL(JGL) - ISTAN = (D_NSTAGT0B(IPROC) + D_NPNTGTB1(KMLOC,JGL))*2*KFIELD + ISTAN = (D_NSTAGT1B(IPROC) + D_NPNTGTB1(KMLOC,JGL))*2*KFIELD IGLS = R_NDGL+1-JGL IPROCS = D_NPROCL(IGLS) - ISTAS = (D_NSTAGT0B(IPROCS) + D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD + ISTAS = (D_NSTAGT1B(IPROCS) + D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD DO JFLD=1,2*KFIELD !write(iunit,*) 'xx ', KM, KFIELD, ISL , KMLOC, JFLD, ISTAN, ISTAS, IGLS, JGL, IPROC, PAOA(JFLD,JGL,KMLOC), PSOA(JFLD,JGL,KMLOC) !call flush(iunit) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index efe1d68bc..58fe3bb5d 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -79,7 +80,7 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) if ( JM .le. G_NMEN(IGLG)) then IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KFIELDS PREEL(2*JF-1,2*JM+1+D_NSTAGTF(KGL)) = FOUBUF(ISTA+2*JF-1) PREEL(2*JF, 2*JM+1+D_NSTAGTF(KGL)) = FOUBUF(ISTA+2*JF ) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index e6eb0ffd7..11ada675e 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -40,7 +40,7 @@ SUBROUTINE FOURIER_OUT(KF_FS) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NPTRLS,D_NSTAGTF,D_MSTABF,D_NSTAGT1B,D_NPNTGTB0,D_NPROCM +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NPTRLS,D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM USE TPM_TRANS ,ONLY : FOUBUF_IN, ZGTF USE TPM_GEOMETRY ,ONLY : G_NMEN ! @@ -57,7 +57,7 @@ SUBROUTINE FOURIER_OUT(KF_FS) ! ------------------------------------------------------------------ -!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT1B,D_MSTABF,D_NPNTGTB0,D_NSTAGTF) +!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,D_NSTAGTF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,IPROC,ISTA,IOFF,JM) DO KGL=1,D%NDGL_FS DO JF=1,KF_FS @@ -68,7 +68,7 @@ SUBROUTINE FOURIER_OUT(KF_FS) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT1B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KF_FS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) diff --git a/src/trans/gpu/internal/fspgl_int_mod.F90 b/src/trans/gpu/internal/fspgl_int_mod.F90 index 651655b10..75e1a1acb 100755 --- a/src/trans/gpu/internal/fspgl_int_mod.F90 +++ b/src/trans/gpu/internal/fspgl_int_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -71,10 +72,10 @@ SUBROUTINE FSPGL_INT(KF_UV,KF_SCALARS,KF_SCDERS,KF_OUT_LT,& !$acc parallel loop DO JGL=ISL,IDGNH IPROC = D%NPROCL(JGL) - ISTAN(JGL) = (D%NSTAGT0B(IPROC) + D%NPNTGTB1(KMLOC,JGL))*2*KF_OUT_LT + ISTAN(JGL) = (D%NSTAGT1B(IPROC) + D%NPNTGTB1(KMLOC,JGL))*2*KF_OUT_LT IGLS = IDGL+1-JGL IPROCS = D%NPROCL(IGLS) - ISTAS(JGL) = (D%NSTAGT0B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*2*KF_OUT_LT + ISTAS(JGL) = (D%NSTAGT1B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*2*KF_OUT_LT ENDDO !$acc parallel loop collapse(2) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 2ff268b31..6800b7c71 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -156,9 +157,11 @@ SUBROUTINE SUMP_TRANS ENDDO IAUX0 = MAX(D%NLTSGTB(MYSETW),IAUX0) IAUX1 = MAX(D%NLTSGTB(MYSETW),IAUX1) - DO JA=1,NPRTRNS+1 - D%NSTAGT0B(JA) = (JA-1)*IAUX0 - D%NSTAGT1B(JA) = (JA-1)*IAUX1 + D%NSTAGT0B(1) = 0 + D%NSTAGT1B(1) = 0 + DO JA=2,NPRTRNS + D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) + D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) ENDDO D%NLENGT0B = IAUX0*NPRTRNS D%NLENGT1B = IAUX1*NPRTRNS diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 32066a80d..e8adf421e 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -113,14 +113,14 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) ITAG = MTAGLM - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + CALL GSTATS(806,0) IF (LSYNC_TRANS) THEN CALL GSTATS(420,0) @@ -269,14 +269,14 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) ITAG = MTAGLM - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT1B(D%MSTABF(J))*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + CALL GSTATS(806,0) !$ACC UPDATE HOST(PFBUF_IN) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 24248cc8e..9072f9a40 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1995- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -106,9 +107,9 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) DO J=1,NPRTRW ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT0B(J)*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*KFIELD ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(D%MSTABF(J))*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*KFIELD ENDDO !write(300+myproc,*)"0:TRMTOL:PFBUF",sum(PFBUF) @@ -265,9 +266,9 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) DO J=1,NPRTRW ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT0B(J)*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*KFIELD ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(D%MSTABF(J))*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*KFIELD ENDDO !write(300+myproc,*)"0:TRMTOL:PFBUF",sum(PFBUF) From cf90e4d6891b481680a9414c1f9f88ff73a0bdc5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:53 -0700 Subject: [PATCH 033/263] Move FOURIER_OUT into FTDIR, and cleanup FTDIR should directly write into the out buffer. Truncation is now implicitly handled (G%NMEN holds truncated loop bounds) --- src/trans/gpu/internal/fourier_out_mod.F90 | 33 +++++++++++----------- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 17 ++--------- src/trans/gpu/internal/ftdir_mod.F90 | 31 +++++++------------- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 3 -- 4 files changed, 29 insertions(+), 55 deletions(-) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 11ada675e..6c55b8834 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -54,27 +54,28 @@ SUBROUTINE FOURIER_OUT(KF_FS) INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF +INTEGER(KIND=JPIM) :: OFFSET_VAR ! ------------------------------------------------------------------ -!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,D_NSTAGTF) +OFFSET_VAR=D_NPTRLS(MYSETW) +!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,D_NSTAGTF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,IPROC,ISTA,IOFF,JM) DO KGL=1,D%NDGL_FS - DO JF=1,KF_FS - - IGLG = D_NPTRLS(MYSETW)+KGL-1 - IOFF = 1+D_NSTAGTF(KGL) - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS - - ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 - FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = ZGTF(2*JF , 2*JM+IOFF) - ENDDO - ENDDO + DO JF=1,KF_FS + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 + + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS + + ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 + FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = ZGTF(2*JF , 2*JM+IOFF) + ENDDO + ENDDO END DO !$ACC END DATA diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index fabb17bce..5dc40fd85 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -194,21 +194,8 @@ END SUBROUTINE cudaProfilerStop !write(301,*) 'sizey: ', myproc, size(zgtf,1), KF_FS CALL GSTATS(1640,0) -!!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -IF(KF_FS>0) THEN - ! TRY THIS IN CHUNKS, ISIZE is even, need equal and even chunks too - ISIZE=size(zgtf,1) - !ICHUNKS=2 - !ICHUNK=ISIZE/ICHUNKS - !ICHUNK=ICHUNK+MOD(ICHUNK,2) - !DO JK=ICHUNKS,1,-1 - ! repeat some fields to have constant chunk size - !IOFF=MAX(1,ISIZE-(ICHUNKS-JK+1)*ICHUNK+1) - IOFF=1 - !ICHUNK=2*KF_FS+2 - ICHUNK=ISIZE - CALL FTDIR(ICHUNK,KF_FS) - !ENDDO +IF (KF_FS > 0) THEN + CALL FTDIR(SIZE(ZGTF,1),KF_FS) ENDIF CALL GSTATS(1640,1) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 7ee166541..c1519672d 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -59,6 +59,7 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE TPM_DIM ,ONLY : R,R_NNOEXTZL USE CUDA_DEVICE_MOD +USE FOURIER_OUT_MOD ,ONLY : FOURIER_OUT ! IMPLICIT NONE @@ -67,13 +68,12 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) INTEGER(KIND=JPIM) :: KGL !!!REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(STRIDE,D%NLENGTF) -INTEGER(KIND=JPIM) :: IGLG,JJ,JF,IST1 +INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1 INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: IPLAN_R2C -INTEGER(KIND=JPIM) :: JMAX REAL(KIND=JPRBT) :: SCAL -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,JTRUNC_START +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC INTEGER(KIND=JPIM) :: OFFSET_VAR real(kind=jprbt), allocatable :: zgtf2(:,:) @@ -108,31 +108,20 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) END DO OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC parallel loop collapse(2) private(JMAX,JJ,IOFF,SCAL,JTRUNC_START) DEFAULT(NONE) -DO KGL=IBEG,IEND,IINC +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,SCAL,JM) DEFAULT(NONE) +DO KGL=1,D%NDGL_FS DO JF=1,2*KF_FS IGLG = OFFSET_VAR+KGL-1 - IOFF=D_NSTAGTF(KGL)+1 - - JMAX = G_NLOEN(IGLG) - ! Multiply with two because we are in complex domain - ! TODO I am not sure if this is +1,0,-1 - JTRUNC_START = MIN(2*(G_NMEN(IGLG)+1)-1,JMAX) - - SCAL = 1._JPRBT/REAL(JMAX,JPRBT) - !$ACC LOOP SEQ - DO JJ=1, JTRUNC_START, 2 - ZGTF(JF,IOFF+JJ-1)= SCAL * ZGTF2(JF, IOFF+JJ-1) - ENDDO + IOFF = D_NSTAGTF(KGL)+1 - ! In fact this is not needed, probably, because FOURIER_OUT could only pack - ! what it needs + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) !$ACC LOOP SEQ - DO JJ=JTRUNC_START + 1, JMAX, 2 - ZGTF(JF,IOFF+JJ-1)= 0.0_JPRBT + DO JM=0,G_NMEN(IGLG) + ZGTF(JF,2*JM+IOFF)= SCAL * ZGTF2(JF,2*JM+IOFF) ENDDO ENDDO ENDDO +CALL FOURIER_OUT(KF_FS) !$acc end data !$ACC end data diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index e843b3c2a..7f461e719 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -46,7 +46,6 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F - USE FOURIER_OUT_MOD ,ONLY : FOURIER_OUT USE LTDIR_MOD ,ONLY : LTDIR @@ -70,8 +69,6 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & !$ACC DATA PRESENT(FOUBUF_IN) CREATE(FOUBUF) - CALL FOURIER_OUT(KF_FS) - CALL GSTATS(153,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' From 2ae19b93fae43bb4426770bd795cd4d24d7b5069 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:53 -0700 Subject: [PATCH 034/263] Directly write FOUBUF_IN --- src/trans/gpu/internal/fourier_out_mod.F90 | 87 ---------------------- src/trans/gpu/internal/ftdir_mod.F90 | 53 +++++++------ 2 files changed, 26 insertions(+), 114 deletions(-) delete mode 100755 src/trans/gpu/internal/fourier_out_mod.F90 diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 deleted file mode 100755 index 6c55b8834..000000000 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ /dev/null @@ -1,87 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FOURIER_OUT_MOD -CONTAINS -SUBROUTINE FOURIER_OUT(KF_FS) - -!**** *FOURIER_OUT* - Copy fourier data from local array to buffer - -! Purpose. -! -------- -! Routine for copying fourier data from local array to buffer - -!** Interface. -! ---------- -! CALL FOURIER_OUT(...) - -! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KF_FS - number of fields -! -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 2000-04-01 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NPTRLS,D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM -USE TPM_TRANS ,ONLY : FOUBUF_IN, ZGTF -USE TPM_GEOMETRY ,ONLY : G_NMEN -! - -IMPLICIT NONE - -!REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM) :: KGL - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA - -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF -INTEGER(KIND=JPIM) :: OFFSET_VAR - -! ------------------------------------------------------------------ - -OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC DATA PRESENT(D,FOUBUF_IN,ZGTF,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,D_NSTAGTF) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(IGLG,IPROC,ISTA,IOFF,JM) -DO KGL=1,D%NDGL_FS - DO JF=1,KF_FS - IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS - - ! imaginary may be not JM+1 but JM+G_NMEN(IGLG)+1 - FOUBUF_IN(ISTA+2*JF-1) = ZGTF(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = ZGTF(2*JF , 2*JM+IOFF) - ENDDO - ENDDO -END DO -!$ACC END DATA - - -! ------------------------------------------------------------------ - -END SUBROUTINE FOURIER_OUT -END MODULE FOURIER_OUT_MOD - diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index c1519672d..e824e4af5 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -47,28 +47,22 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB, JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC,D_NSTAGTF,D_NPTRLS -USE TPM_TRANS ,ONLY : ZGTF -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX -USE TPM_FFT ,ONLY : T -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif +USE TPM_GEN ,ONLY : LSYNC_TRANS +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NPTRLS, D_NSTAGT0B, D_NPNTGTB0, D_NPROCM +USE TPM_TRANS ,ONLY : ZGTF, FOUBUF_IN +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT -USE TPM_DIM ,ONLY : R,R_NNOEXTZL USE CUDA_DEVICE_MOD -USE FOURIER_OUT_MOD ,ONLY : FOURIER_OUT ! IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KF_FS INTEGER(KIND=JPIM) :: KGL -!!!REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(STRIDE,D%NLENGTF) -INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1 +INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1, IPROC, ISTA INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: IPLAN_R2C REAL(KIND=JPRBT) :: SCAL @@ -92,9 +86,9 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) allocate(zgtf2(size(zgtf,1),size(zgtf,2))) !$ACC DATA & -!$ACC& PRESENT(ZGTF,D,D_NSTAGTF,D_NPTRLS,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,R_NNOEXTZL) - -!$ACC DATA CREATE(ZGTF2) +!$ACC& PRESENT(ZGTF,FOUBUF_IN, & +!$ACC& D,D_NSTAGTF,G_NMEN,G_NLOEN,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM) & +!$ACC& CREATE(ZGTF2) DO KGL=IBEG,IEND,IINC @@ -107,24 +101,29 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) !$ACC end host_data END DO + +! scale results and move into next transformation buffer + OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,SCAL,JM) DEFAULT(NONE) DO KGL=1,D%NDGL_FS - DO JF=1,2*KF_FS - IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 - - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - ZGTF(JF,2*JM+IOFF)= SCAL * ZGTF2(JF,2*JM+IOFF) + DO JF=1,KF_FS + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 + + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS + + FOUBUF_IN(ISTA+2*JF-1) = SCAL * ZGTF2(2*JF-1, 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = SCAL * ZGTF2(2*JF , 2*JM+IOFF) ENDDO ENDDO ENDDO -CALL FOURIER_OUT(KF_FS) -!$acc end data -!$ACC end data +!$ACC END DATA ! ------------------------------------------------------------------ From 75b74c49875d56776dd1d0e1bd82a3ad046ac868 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:54 -0700 Subject: [PATCH 035/263] assume that trgtol outputs on device --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 1 - src/trans/gpu/internal/trgtol_mod.F90 | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 5dc40fd85..722fee612 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -183,7 +183,6 @@ END SUBROUTINE cudaProfilerStop #else CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) -!$ACC UPDATE DEVICE(ZGTF) #endif CALL GSTATS(158,1) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 7e2aced3e..f545b6a7d 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1359,6 +1359,8 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) + !$ACC UPDATE DEVICE(PGLAT) + IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL From baa42bec8f773e4bb151b4fdd1c9699536e84a2e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:54 -0700 Subject: [PATCH 036/263] ZGTF is completely written! --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index d175ccde6..71c996fc2 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2001- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -126,10 +127,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! Perform transform -!$ACC KERNELS -ZGTF(:,:) = 0 -!$ACC END KERNELS - IF_GPB = 2*KF_UV_G+KF_SCALARS_G IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN From 2de7adf75e9c9ee08ba35dc65d7832011c6fcff0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:54 -0700 Subject: [PATCH 037/263] pin buffers --- src/programs/CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index 731a492ae..d4f232621 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -1,4 +1,5 @@ # (C) Copyright 2020- ECMWF. +# (C) Copyright 2022- NVIDIA. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -75,10 +76,10 @@ if( HAVE_GPU ) #target_link_libraries( driver-spectrans PRIVATE OpenACC::OpenACC_Fortran ) set_property( TARGET driver-spectrans-${prec} PROPERTY CUDA_ARCHITECTURES 70 ) set_property( TARGET driver-spectrans-CA-${prec} PROPERTY CUDA_ARCHITECTURES 70 ) - target_compile_options( driver-spectrans-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc -cudalib=cufft,cublas -fpic> ) - target_compile_options( driver-spectrans-CA-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc -cudalib=cufft,cublas -fpic> ) - set_target_properties(driver-spectrans-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic") - set_target_properties(driver-spectrans-CA-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic") + target_compile_options( driver-spectrans-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) + target_compile_options( driver-spectrans-CA-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) + set_target_properties(driver-spectrans-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") + set_target_properties(driver-spectrans-CA-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") message("Building ${prec} GPU driver") endif() endforeach() From aac7e2a95fd4e16a36f1723b2b617d3c6107035f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:54 -0700 Subject: [PATCH 038/263] Add option to disable file dumps --- src/programs/driver-spectraltransform.F90 | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 25064a71d..5c608721f 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2014- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -81,7 +82,7 @@ PROGRAM TRANSFORM_TEST LOGICAL :: LUSERPNM, LKEEPRPNM, LUSEFLT LOGICAL :: LTRACE_STATS,LSTATS_OMP, LSTATS_COMMS, LSTATS_MPL LOGICAL :: LSTATS,LBARRIER_STATS, LBARRIER_STATS2, LDETAILED_STATS -LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM +LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM, LDUMP_DATA LOGICAL :: LXML_STATS LOGICAL :: LFFTW INTEGER(KIND=JPIM) :: NSTATS_MEM, NTRACE_STATS, NPRNT_STATS @@ -140,7 +141,7 @@ PROGRAM TRANSFORM_TEST & LUSERPNM, LKEEPRPNM, LUSEFLT, NQ, NLIN, IMAX_FLDS_IN, & & NPRINTNORMS, ITERS, ZMAXERR_CHECK, NPROMA, NPROMATR, LEQ_REGIONS, & & NPRINTLEV, NPRTRW, NPRTRV, NSPECRESMIN, NFLEVG, MBX_SIZE, LSTACK, & - & LFFTW + & LFFTW, LDUMP_DATA ! ------------------------------------------------------------------ @@ -215,6 +216,7 @@ PROGRAM TRANSFORM_TEST LSTACK=.FALSE. ! Use FFTW LFFTW=.TRUE. +LDUMP_DATA=.TRUE. ! Default number of vertical levels NFLEVG=137 @@ -857,10 +859,12 @@ PROGRAM TRANSFORM_TEST ZTSTEP1(JSTEP)=(TIMEF()-ZTSTEP1(JSTEP))/1000.0_JPRD ! Dump a field to a binary file - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMVS(:,1,:), 'S', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,3,:), 'U', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,4,:), 'V', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMV(:,NFLEVG,5,:), 'T', NOUTDUMP) + if (LDUMP_DATA) THEN + CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMVS(:,1,:), 'S', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,3,:), 'U', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,4,:), 'V', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMV(:,NFLEVG,5,:), 'T', NOUTDUMP) + ENDIF ZTSTEP2(JSTEP)=TIMEF() CALL DIR_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,& From dda2816e15f4d1d1ab747028d873b8fb9454a197 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:54 -0700 Subject: [PATCH 039/263] improve data regions / add some async --- src/trans/gpu/internal/ltinv_mod.F90 | 33 +++++++++++---------------- src/trans/gpu/internal/trgtol_mod.F90 | 32 ++++++++++++++------------ 2 files changed, 30 insertions(+), 35 deletions(-) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 3ee42f64c..8adf93439 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -168,6 +169,13 @@ END SUBROUTINE cudaProfilerStop ! COPY FROM PSPXXXX TO ZIA + CALL GSTATS(431,0) + !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYIN(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) + !$ACC DATA COPYIN(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) + !$ACC DATA COPYIN(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) + !$ACC DATA COPYIN(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) + CALL GSTATS(431,1) IF (KF_UV > 0) THEN IVORL = 1 IVORU = 2*KF_UV @@ -179,12 +187,8 @@ END SUBROUTINE cudaProfilerStop IVU = 8*KF_UV IDIM2=UBOUND(PSPVOR,2) - CALL GSTATS(431,0) - !$ACC DATA COPYIN(PSPVOR,PSPDIV) - CALL GSTATS(431,1) CALL PRFI1B(ZIA(IVORL:IVORU,:,:),PSPVOR,KF_UV,IDIM2,KFLDPTRUV) CALL PRFI1B(ZIA(IDIVL:IDIVU,:,:),PSPDIV,KF_UV,IDIM2,KFLDPTRUV) - !$ACC END DATA ! ------------------------------------------------------------------ @@ -200,21 +204,13 @@ END SUBROUTINE cudaProfilerStop ILAST = IFIRST - 1 + 2*KF_SCALARS IDIM2=UBOUND(PSPSCALAR,2) - CALL GSTATS(431,0) - !$ACC DATA COPYIN(PSPSCALAR) - CALL GSTATS(431,1) CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSCALAR(:,:),KF_SCALARS,IDIM2,KFLDPTRSC) - !$ACC END DATA ELSE IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN IFIRST = ILAST+1 ILAST = IFIRST-1+2*NF_SC2 IDIM2=UBOUND(PSPSC2,2) - CALL GSTATS(431,0) - !$ACC DATA COPYIN(PSPSC2) - CALL GSTATS(431,1) CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC2(:,:),NF_SC2,IDIM2) - !$ACC END DATA ENDIF IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN IDIM1=NF_SC3A @@ -222,28 +218,20 @@ END SUBROUTINE cudaProfilerStop IFIRST = ILAST+1 ILAST = IFIRST-1+2*IDIM1 IDIM2=UBOUND(PSPSC3A,2) - CALL GSTATS(431,0) - !$ACC DATA COPYIN(PSPSC3A) - CALL GSTATS(431,1) DO J3=1,IDIM3 CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC3A(:,:,J3),IDIM1,IDIM2) ENDDO - !$ACC END DATA ENDIF IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN IDIM1=NF_SC3B IDIM3=UBOUND(PSPSC3B,3) IDIM2=UBOUND(PSPSC3B,2) - CALL GSTATS(431,0) - !$ACC DATA COPYIN(PSPSC3B) - CALL GSTATS(431,1) DO J3=1,IDIM3 IFIRST = ILAST+1 ILAST = IFIRST-1+2*IDIM1 CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC3B(:,:,J3),IDIM1,IDIM2) ENDDO - !$ACC END DATA ENDIF ENDIF IF(ILAST /= 8*KF_UV+2*KF_SCALARS) THEN @@ -251,6 +239,11 @@ END SUBROUTINE cudaProfilerStop CALL ABORT_TRANS('LTINV_MOD:ILAST /= 8*KF_UV+2*KF_SCALARS') ENDIF ENDIF + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA IF (KF_SCDERS > 0) THEN ! stop 'Error: code path not (yet) supported in GPU version' diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index f545b6a7d..f2b168aeb 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -338,7 +338,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL INIGPTR(IGPTRSEND,IGPTRRECV) LLDONE = .FALSE. - !$ACC DATA COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) + !$ACC DATA COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) ASYNC(1) ITAG = MTAGGL @@ -417,10 +417,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(IRECVCOUNT,INRECV)) - !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) - !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) + !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) + !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) - !$ACC KERNELS DEFAULT(NONE) + !$ACC KERNELS DEFAULT(NONE) ASYNC(1) IF (IBUFLENS > 0) ZCOMBUFS(:,:) = 0. IF (IBUFLENR > 0) ZCOMBUFR(:,:) = 0. !$ACC END KERNELS @@ -430,12 +430,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! Send loop............................................................. ! Copy local contribution - !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) - !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) COPYIN(IUVLEVS,IUVPARS) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) COPYIN(IGP2PARS) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) COPYIN(IGP3ALEVS,IGP3APARS) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) + !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) COPYIN(IUVLEVS,IUVPARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) COPYIN(IGP2PARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) COPYIN(IGP3ALEVS,IGP3APARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) ASYNC(1) IF(ISENDTOT(MYPROC) > 0 )THEN ! Input is KF_GP fields. We find the resulting KF_FS fields. @@ -463,11 +463,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) + !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) CALL GSTATS(1601,0) IF(LLPGPONLY) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,KF_FS DO JKL=1, JK_MAX @@ -483,7 +483,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,KF_FS DO JKL=1, JK_MAX @@ -546,7 +546,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(INS,JK_MAX,IJPOS,IFLDA) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(IJPOS,IFLDA) ASYNC(1) DO JJ=1,ISEND_FLD_CNT DO JBLK=1,NGPBLKS DO JKL=1, JK_MAX @@ -585,6 +585,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1602,1) + !$ACC WAIT(1) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',0,ZHOOK_HANDLE_BAR) CALL GSTATS_BARRIER(761) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) @@ -657,7 +658,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS INDOFFL = INDOFF(IRECV) - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN II = KINDEX(INDOFFL+JL) @@ -665,6 +666,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ENDDO + !$ACC WAIT(1) !$ACC END DATA !#ifdef COMVERBOSE From 7606b23010dac48bd3bf75dbf9ba185bf97e7357 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:55 -0700 Subject: [PATCH 040/263] add 2 more labels --- src/trans/gpu/internal/ftdir_mod.F90 | 10 ++++++++++ src/trans/gpu/internal/ftinv_mod.F90 | 12 ++++++++++++ src/trans/gpu/internal/gstats_label_ifs.F90 | 5 +++++ src/trans/gpu/internal/ledir_mod.F90 | 11 +++++++++++ src/trans/gpu/internal/leinv_mod.F90 | 12 ++++++++++++ 5 files changed, 50 insertions(+) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index e824e4af5..ca5c4796b 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -55,6 +55,7 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD +USE MPL_MODULE ,ONLY : MPL_BARRIER ! IMPLICIT NONE @@ -90,6 +91,11 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) !$ACC& D,D_NSTAGTF,G_NMEN,G_NLOEN,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM) & !$ACC& CREATE(ZGTF2) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') +ENDIF +CALL GSTATS(450,0) + DO KGL=IBEG,IEND,IINC IOFF=D%NSTAGTF(KGL)+1 @@ -101,6 +107,10 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) !$ACC end host_data END DO +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') +ENDIF +CALL GSTATS(450,1) ! scale results and move into next transformation buffer diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index e5fdbb4be..2c7834d19 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -44,6 +44,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ +USE TPM_GEN ,ONLY : LSYNC_TRANS USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC @@ -56,6 +57,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, destroy_plan_fft USE TPM_DIM ,ONLY : R USE CUDA_DEVICE_MOD +USE MPL_MODULE ,ONLY : MPL_BARRIER IMPLICIT NONE @@ -114,6 +116,11 @@ SUBROUTINE FTINV(PREEL,KFIELDS) allocate(preel2(size(preel,1),size(preel,2))) !$acc data create(preel2) present(preel) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') +ENDIF +CALL GSTATS(451,0) + !istat = cuda_GetDevice(idev) !istat = cuda_Synchronize() !!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(istat,KGL,IOFF,IGLG,IPLAN_C2R) @@ -133,6 +140,11 @@ SUBROUTINE FTINV(PREEL,KFIELDS) !!$OMP END PARALLEL DO istat = cuda_Synchronize() +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') +ENDIF +CALL GSTATS(451,1) + !$acc kernels preel(:,:) = preel2(:,:) diff --git a/src/trans/gpu/internal/gstats_label_ifs.F90 b/src/trans/gpu/internal/gstats_label_ifs.F90 index 247d7fc37..253cc81bf 100644 --- a/src/trans/gpu/internal/gstats_label_ifs.F90 +++ b/src/trans/gpu/internal/gstats_label_ifs.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -229,6 +230,10 @@ SUBROUTINE GSTATS_LABEL_IFS CALL GSTATS_LABEL(431,' ','INV COPIES') CALL GSTATS_LABEL(440,' ','FULL DIRTRANS') CALL GSTATS_LABEL(441,' ','FULL INVTRANS') +CALL GSTATS_LABEL(450,' ','FFTDIR - PLANS') +CALL GSTATS_LABEL(451,' ','FFTINV - PLANS') +CALL GSTATS_LABEL(452,' ','LEDIR') +CALL GSTATS_LABEL(453,' ','LEINV') ! counters 500 to 2000 CALL GSTATS_LABEL(501,'MPL','SLCOMM2_COMMS PART1') diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 7ecf7cda0..524b74874 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -53,6 +53,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ +USE TPM_GEN ,ONLY : LSYNC_TRANS USE PARKIND_ECTRANS ,ONLY : JPIM ,JPIB ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL @@ -68,6 +69,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) USE BUTTERFLY_ALG_MOD USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED +USE MPL_MODULE ,ONLY : MPL_BARRIER USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC @@ -100,6 +102,11 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! anti-symmetric +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') +ENDIF +CALL GSTATS(452,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) DO KMLOC=1,D_NUMP DO JF=1,KF_FS*2 @@ -275,6 +282,10 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !$ACC END DATA +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') +ENDIF +CALL GSTATS(452,1) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 1c9ae9c0b..407077a07 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -53,6 +54,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ +USE TPM_GEN ,ONLY : LSYNC_TRANS USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX @@ -68,6 +70,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) USE BUTTERFLY_ALG_MOD USE CUDA_GEMM_BATCHED_MOD USE, INTRINSIC :: ISO_C_BINDING +USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC IMPLICIT NONE @@ -103,6 +106,10 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! -------------------------- !* 1.1 PREPARATIONS. +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') +ENDIF +CALL GSTATS(453,0) ALLOCATE(ZZBA(ITDZBA,ILDZBA,D_NUMP)) ALLOCATE(ZZCSTA(ITDZCA,ILDZCA,D_NUMP)) @@ -293,6 +300,11 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) DEALLOCATE(ZZCSTS) DEALLOCATE(ZZCSTA) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') +ENDIF +CALL GSTATS(453,1) + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From 5230b16ea4770ef02c293604ae396a7e551bef1c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:55 -0700 Subject: [PATCH 041/263] add quite some new barriers/labels --- src/trans/gpu/external/dir_trans.F90 | 9 +++++++++ src/trans/gpu/external/inv_trans.F90 | 9 +++++++++ src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 8 ++++++++ src/trans/gpu/internal/ftdir_ctl_mod.F90 | 8 ++++++++ src/trans/gpu/internal/ltinv_mod.F90 | 8 ++++++++ src/trans/gpu/internal/trgtol_mod.F90 | 3 +++ src/trans/gpu/internal/trltog_mod.F90 | 4 ++++ src/trans/gpu/internal/trltom_mod.F90 | 14 +++++++------- src/trans/gpu/internal/trmtol_mod.F90 | 14 +++++++------- 9 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 index ffabc1f5e..9060faff1 100755 --- a/src/trans/gpu/external/dir_trans.F90 +++ b/src/trans/gpu/external/dir_trans.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -122,6 +123,8 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE DIR_TRANS_CTL_MOD ,ONLY : DIR_TRANS_CTL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_GEN ,ONLY : LSYNC_TRANS !endif INTERFACE @@ -161,6 +164,9 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('DIR_TRANS',0,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(440,0) CALL GSTATS(1808,0) ! Set current resolution @@ -527,6 +533,9 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) IF (LHOOK) CALL DR_HOOK('DIR_TRANS',1,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(440,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index 5e7b025a1..ca75c2171 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -142,6 +143,8 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE INV_TRANS_CTL_MOD ,ONLY : INV_TRANS_CTL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_GEN ,ONLY : LSYNC_TRANS #ifdef _OPENACC use openacc @@ -196,6 +199,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& call flush(unit_no) IF (LHOOK) CALL DR_HOOK('INV_TRANS',0,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(441,0) CALL GSTATS(1807,0) ! Set current resolution @@ -643,6 +649,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) IF (LHOOK) CALL DR_HOOK('INV_TRANS',1,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(441,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 71c996fc2..21a9f740d 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -83,6 +83,8 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE LTDIR_CTL_MOD ,ONLY : LTDIR_CTL USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL USE TPM_TRANS ,ONLY : ZGTF +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_GEN ,ONLY : LSYNC_TRANS use nvtx ! @@ -201,6 +203,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) call nvtxEndRange +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(430,0) !$ACC END DATA !$ACC END DATA @@ -208,6 +213,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& !$ACC END DATA !$ACC END DATA !$ACC END DATA +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(430,1) call nvtxEndRange diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 722fee612..1800b9f6b 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -65,6 +65,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE USE FTDIR_MOD ,ONLY : FTDIR +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_GEN ,ONLY : LSYNC_TRANS use ieee_arithmetic ! @@ -166,12 +168,18 @@ END SUBROUTINE cudaProfilerStop ! needed ??? JF_FS=KF_FS-D%IADJUST_D #ifdef USE_CUDA_AWARE_MPI_FT +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(430,0) !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') +ENDIF CALL GSTATS(430,1) CALL TRGTOL_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 8adf93439..1edbe7993 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -36,6 +36,8 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& use ieee_arithmetic !USE TPM_FIELDS ,ONLY : F,ZIA,ZSOA1,ZAOA1,ISTAN,ISTAS,ZEPSNM USE TPM_FIELDS ,ONLY : F,ZIA,ZSOA1,ZAOA1,ZEPSNM + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS !**** *LTINV* - Inverse Legendre transform @@ -169,12 +171,18 @@ END SUBROUTINE cudaProfilerStop ! COPY FROM PSPXXXX TO ZIA + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') + ENDIF CALL GSTATS(431,0) !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA COPYIN(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) !$ACC DATA COPYIN(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) !$ACC DATA COPYIN(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) !$ACC DATA COPYIN(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') + ENDIF CALL GSTATS(431,1) IF (KF_UV > 0) THEN IVORL = 1 diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index f2b168aeb..62b16304f 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -630,6 +630,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & & CDSTRING='TRGTOL_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') ENDIF + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='TRGTOL BARRIER') + ENDIF CALL GSTATS(413,1) !#ifdef COMVERBOSE diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index e83a8fd87..faa6b9182 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1995- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -605,6 +606,9 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& & CDSTRING='TRLTOG_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') ENDIF + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='TRLTOG BARRIER') + ENDIF CALL GSTATS(412,1) #ifdef COMVERBOSE call MPI_BARRIER(MPI_COMM_WORLD,IERROR) diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index e8adf421e..2229dc345 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -122,11 +122,6 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) ENDDO CALL GSTATS(806,0) - IF (LSYNC_TRANS) THEN - CALL GSTATS(420,0) - CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') - CALL GSTATS(420,1) - ENDIF ! copy to self workaround IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) @@ -146,18 +141,23 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) ILENR(IRANK) = 0 ENDIF - CALL GSTATS(411,0) !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) + IF (LSYNC_TRANS) THEN + CALL GSTATS(420,0) + CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') + CALL GSTATS(420,1) + ENDIF + CALL GSTATS(411,0) CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & & MPL_ALL_MS_COMM, IERROR) !$ACC END HOST_DATA - CALL GSTATS(411,1) !$ACC WAIT(1) CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) + CALL GSTATS(411,1) CALL GSTATS(806,1) ELSE ILEN = D%NLTSGTB(MYSETW)*2*KF_FS diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 9072f9a40..419cd747c 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -119,11 +119,6 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',0,ZHOOK_HANDLE_BAR) ! CALL GSTATS_BARRIER(764) IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',1,ZHOOK_HANDLE_BAR) - IF (LSYNC_TRANS) THEN - CALL GSTATS(421,0) - CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') - CALL GSTATS(421,1) - ENDIF ! copy to self workaround IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) @@ -138,20 +133,25 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ILENS(IRANK) = 0 ILENR(IRANK) = 0 ENDIF - CALL GSTATS(410,0) !$ACC DATA PRESENT(PFBUF_IN, PFBUF) !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) CALL GSTATS(807,0) + IF (LSYNC_TRANS) THEN + CALL GSTATS(421,0) + CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') + CALL GSTATS(421,1) + ENDIF + CALL GSTATS(410,0) CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& & MPL_ALL_MS_COMM,IERROR) !$ACC END HOST_DATA !$ACC END DATA - CALL GSTATS(410,1) CALL GSTATS(807,1) !$ACC WAIT(1) + CALL GSTATS(410,1) IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR2',0,ZHOOK_HANDLE_BAR2) !CALL GSTATS_BARRIER2(764) From 18c0e99ad747f28acd3a7badb3eda03e5f097e1a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:55 -0700 Subject: [PATCH 042/263] Model FOURIER_IN according to FOURIER_OUT --- src/trans/gpu/internal/fourier_in_mod.F90 | 70 ++++++----------------- 1 file changed, 18 insertions(+), 52 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 58fe3bb5d..430e82a6b 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -40,7 +40,7 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_MSTABF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS USE TPM_TRANS ,ONLY : FOUBUF USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NMEN_MAX use tpm_gen, only: nout @@ -54,7 +54,7 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA,OFFSET_VAR,IOFF INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,iimax1,iimax2,iimax3,iunit ! ------------------------------------------------------------------ @@ -69,60 +69,26 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) IINC=-1 ENDIF -!$ACC DATA PRESENT(D_NPTRLS,G_NMEN,D_NPROCM,D_NSTAGT0B,D_MSTABF,D_NPNTGTB0,FOUBUF,PREEL,D_NSTAGTF) -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IGLG,IPROC,ISTA) DEFAULT(NONE) +OFFSET_VAR=D_NPTRLS(MYSETW) +!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL,D_NSTAGTF) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) DO KGL=IBEG,IEND,IINC - DO JM=0,G_NMEN_MAX - DO JF=1,KFIELDS - - IGLG = D_NPTRLS(MYSETW)+KGL-1 - - if ( JM .le. G_NMEN(IGLG)) then - - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KFIELDS - - PREEL(2*JF-1,2*JM+1+D_NSTAGTF(KGL)) = FOUBUF(ISTA+2*JF-1) - PREEL(2*JF, 2*JM+1+D_NSTAGTF(KGL)) = FOUBUF(ISTA+2*JF ) - !write(nout,*) , 'istart1 ...', KGL, JM, JF, ISTA+2*JF,ISTA,D_NSTAGT0B(D_MSTABF(IPROC)),IPROC,KFIELDS - !write(nout,*) , 'istart2 ...',D_NPNTGTB0(JM,KGL), FOUBUF(ISTA+2*JF-1), FOUBUF(ISTA+2*JF),2*JM+1+D_NSTAGTF(KGL) - !if(jf==1 .and. 2*JM+1+D_NSTAGTF(KGL)==7972) write(nout,*) 'fourier_in: fidx=7972, kgl=',kgl,' jm=',jm - !TODO (Andreas): should be able to remove the factor 2 in the second dimension (in front of jm) - !and reduce the size of the array. Will need to adapt fsc_mod accordingly! This is actually more - !difficult: d_nstagtf(kgl) is not necessarily even! - - end if - ENDDO - ENDDO + DO JF=1,KFIELDS + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 + + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KFIELDS + + PREEL(2*JF-1,2*JM+IOFF) = FOUBUF(ISTA+2*JF-1) + PREEL(2*JF, 2*JM+IOFF) = FOUBUF(ISTA+2*JF ) + ENDDO + ENDDO ENDDO !$ACC END DATA -!iimax1=0 -!iimax2=0 -!iimax3=0 -!iunit=myproc+300 -!DO KGL=IBEG,IEND,IINC -! DO JM=0,G_NMEN_MAX -! DO JF=1,KFIELDS -! -! IGLG = D_NPTRLS(MYSETW)+KGL-1 -! -! if ( JM .le. G_NMEN(IGLG)) then -! -! IPROC = D_NPROCM(JM) -! ISTA = (D_NSTAGT0B(D_MSTABF(IPROC))+D_NPNTGTB0(JM,KGL))*2*KFIELDS -! -! iimax1=max(iimax1,2*JF) -! iimax2=max(iimax2,2*JM+1+D_NSTAGTF(KGL)) -! iimax3=max(iimax3,ISTA+2*JF) -! -! endif -! ENDDO -! ENDDO -!ENDDO -!write(iunit,*) 'max_in ',iimax1,size(PREEL,1),iimax2,size(PREEL,2),iimax3,size(FOUBUF) -! ------------------------------------------------------------------ - END SUBROUTINE FOURIER_IN END MODULE FOURIER_IN_MOD From 6416140b56732ccdebde21cf20d30148ff29e013 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:55 -0700 Subject: [PATCH 043/263] Merge the many kernels in FSC --- src/trans/gpu/internal/fsc_mod.F90 | 215 +++++++++++++++-------------- 1 file changed, 108 insertions(+), 107 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 5040840af..5c05695b0 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -47,7 +48,7 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_TRANS ,ONLY : LUVDER, LATLON, ZGTF +USE TPM_TRANS ,ONLY : LUVDER, LATLON, ZGTF, LVORGP, LDIVGP USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_FIELDS ,ONLY : F USE TPM_GEOMETRY ,ONLY : G @@ -60,24 +61,29 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& INTEGER(KIND=JPIM) , INTENT(IN) :: KF_UV,KF_SCALARS,KF_SCDERS INTEGER(KIND=JPIM) , INTENT(IN) :: KST_UV, KST_SC, KST_NSDERS, KST_EWDERS, KST_UVDERS -REAL(KIND=JPRBT) , POINTER :: PUV(:,:) -REAL(KIND=JPRBT) , POINTER :: PSCALAR(:,:) -REAL(KIND=JPRBT) , POINTER :: PNSDERS(:,:) -REAL(KIND=JPRBT) , POINTER :: PEWDERS(:,:) -REAL(KIND=JPRBT) , POINTER :: PUVDERS(:,:) - REAL(KIND=JPRBT) :: ZACHTE,ZMUL, ZACHTE2, ZSHIFT, ZPI REAL(KIND=JPRBT) :: ZAMP, ZPHASE INTEGER(KIND=JPIM) :: IMEN,ISTAGTF -INTEGER(KIND=JPIM) :: JLON,JF,IGLG,II,IR,JM +INTEGER(KIND=JPIM) :: JF,IGLG,II,IR,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC !DEBUGGING: -integer :: i,J,maxi,maxj +integer :: i,J,maxi,maxj,ist real :: maxv +INTEGER(JPIM) :: ZGTF_START(8) +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_VOR = 1 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_DIV = 2 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UV = 3 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_SCALAR = 4 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_NSDERS = 5 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UVDERS = 6 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_EWDERS = 7 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_END = 8 +INTEGER(JPIM) :: JF_UV, JF_SCALAR + ! ------------------------------------------------------------------ IF(MYPROC > NPROC/2)THEN @@ -90,80 +96,36 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& IINC=-1 ENDIF -!write(301,*) ' nums ', KST_UV, KST_SC, KF_UV, KST_nsders, KST_ewders, KF_SCDERS, KST_uvders, D%NDGL_FS -IF( KF_UV > 0 ) THEN - PUV => ZGTF(2*KST_UV-1:2*(KST_UV+2*KF_UV-1),:) +IST = 1 +ZGTF_START(ZGTF_START_INDEX_VOR) = IST +IF (LVORGP) THEN + IST = IST+KF_UV ENDIF -PSCALAR => ZGTF(2*KST_SC-1:2*(KST_SC+KF_SCALARS-1),:) -IF( KF_SCDERS > 0 ) THEN - PNSDERS => ZGTF(2*KST_nsders-1:2*(KST_nsders+KF_SCDERS-1),:) - PEWDERS => ZGTF(2*KST_ewders-1:2*(KST_ewders+KF_SCDERS-1),:) +ZGTF_START(ZGTF_START_INDEX_DIV) = IST +IF (LDIVGP) THEN + IST = IST+KF_UV ENDIF +ZGTF_START(ZGTF_START_INDEX_UV) = IST +IST = IST+2*KF_UV +ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST +IST = IST+KF_SCALARS +ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST +IST = IST+KF_SCDERS +ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST IF (LUVDER) THEN - PUVDERS => ZGTF(2*KST_uvders-1:2*(KST_uvders+2*KF_UV-1),:) + IST = IST+2*KF_UV ENDIF +ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST +IF (KF_SCDERS > 0) THEN + IST = IST+KF_SCDERS +ENDIF +ZGTF_START(ZGTF_START_INDEX_END) = IST -!$ACC DATA PRESENT(ZGTF) & -!$ACC& PRESENT(PUV,PSCALAR,PNSDERS,PEWDERS,PUVDERS) - -DO KGL=IBEG,IEND,IINC - -IGLG = D%NPTRLS(MYSETW)+KGL-1 -ZACHTE = F%RACTHE(IGLG) -IMEN = G%NMEN(IGLG) -ISTAGTF = D%NSTAGTF(KGL) -ZACHTE2 = F%RACTHE(IGLG) - -IF( LATLON.AND.S%LDLL ) THEN - ZPI = 2.0_JPRBT*ASIN(1.0_JPRBT) - ZACHTE2 = 1._JPRBT - ZACHTE = F%RACTHE2(IGLG) - - ! apply shift for (even) lat-lon output grid - IF( S%LSHIFTLL ) THEN - ZSHIFT = ZPI/REAL(G%NLOEN(IGLG),JPRBT) +!$ACC DATA PRESENT(ZGTF,D,G,F) COPYIN(ZGTF_START) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IR,II,ZAMP,ZPHASE) - DO JF=1,KF_SCALARS - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - II = IR+1 - - ! calculate amplitude and add phase shift then reconstruct A,B - ZAMP = SQRT(PSCALAR(JF,IR)**2 + PSCALAR(JF,II)**2) - ZPHASE = ATAN2(PSCALAR(JF,II),PSCALAR(JF,IR)) + REAL(JM,JPRBT)*ZSHIFT - - PSCALAR(2*JF-1,IR) = ZAMP*COS(ZPHASE) - PSCALAR(2*JF, Ir) = ZAMP*SIN(ZPHASE) - ENDDO - ENDDO - IF(KF_SCDERS > 0)THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IR,II,ZAMP,ZPHASE) - DO JF=1,KF_SCALARS - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - II = IR+1 - ! calculate amplitude and phase shift and reconstruct A,B - ZAMP = SQRT(PNSDERS(JF,IR)**2 + PNSDERS(JF,II)**2) - ZPHASE = ATAN2(PNSDERS(JF,II),PNSDERS(JF,IR)) + REAL(JM,JPRBT)*ZSHIFT - PNSDERS(2*JF-1,IR) = ZAMP*COS(ZPHASE) - PNSDERS(2*JF, Ir) = ZAMP*SIN(ZPHASE) - ENDDO - ENDDO - ENDIF - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IR,II,ZAMP,ZPHASE) - DO JF=1,2*KF_UV - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - II = IR+1 - ! calculate amplitude and phase shift and reconstruct A,B - ZAMP = SQRT(PUV(JF,IR)**2 + PUV(JF,II)**2) - ZPHASE = ATAN2(PUV(JF,II),PUV(JF,IR)) + REAL(JM,JPRBT)*ZSHIFT - PUV(2*JF-1,IR) = ZAMP*COS(ZPHASE) - PUV(2*JF, Ir) = ZAMP*SIN(ZPHASE) - ENDDO - ENDDO - ENDIF +IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN + PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" + STOP 128 ! not implemented ENDIF ! ------------------------------------------------------------------ @@ -171,63 +133,102 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& !* 1. DIVIDE U V AND N-S DERIVATIVES BY A*COS(THETA) ! ---------------------------------------------- - !* 1.1 U AND V. -IF(KF_UV > 0) THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) - DO JLON=ISTAGTF+1,ISTAGTF+2*IMEN+1 - DO JF=1,2*KF_UV - PUV(2*JF-1,JLON) = PUV(2*JF-1,JLON)*ZACHTE2 - PUV(2*JF, JLON) = PUV(2*JF ,JLON)*ZACHTE2 +IF (ZGTF_START(ZGTF_START_INDEX_UV) /= ZGTF_START(ZGTF_START_INDEX_UV+1)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + DO KGL=IBEG,IEND,IINC + DO JF=ZGTF_START(ZGTF_START_INDEX_UV),ZGTF_START(ZGTF_START_INDEX_UV+1)-1 + + IGLG = D%NPTRLS(MYSETW)+KGL-1 + IMEN = G%NMEN(IGLG) + ISTAGTF = D%NSTAGTF(KGL) + ZACHTE2 = F%RACTHE(IGLG) + + !$ACC LOOP SEQ + DO JM=0,2*IMEN + IR = ISTAGTF+JM+1 + ZGTF(2*JF-1,IR) = ZGTF(2*JF-1,IR)*ZACHTE2 + ZGTF(2*JF, IR) = ZGTF(2*JF ,IR)*ZACHTE2 + ENDDO ENDDO ENDDO ENDIF !* 1.2 N-S DERIVATIVES -IF(KF_SCDERS > 0)THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) - DO JLON=ISTAGTF+1,ISTAGTF+2*IMEN+1 - DO JF=1,KF_SCALARS - PNSDERS(2*JF-1,JLON) = PNSDERS(2*JF-1,JLON)*ZACHTE2 - PNSDERS(2*JF, JLON) = PNSDERS(2*JF, JLON)*ZACHTE2 +IF (ZGTF_START(ZGTF_START_INDEX_NSDERS) /= ZGTF_START(ZGTF_START_INDEX_NSDERS+1)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + DO KGL=IBEG,IEND,IINC + DO JF=ZGTF_START(ZGTF_START_INDEX_NSDERS),ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1 + + IGLG = D%NPTRLS(MYSETW)+KGL-1 + IMEN = G%NMEN(IGLG) + ISTAGTF = D%NSTAGTF(KGL) + ZACHTE2 = F%RACTHE(IGLG) + + !$ACC LOOP SEQ + DO JM=0,2*IMEN + IR = ISTAGTF+JM+1 + ZGTF(2*JF-1,IR) = ZGTF(2*JF-1,IR)*ZACHTE2 + ZGTF(2*JF, IR) = ZGTF(2*JF, IR)*ZACHTE2 + ENDDO ENDDO ENDDO ENDIF -! ------------------------------------------------------------------ + ! ------------------------------------------------------------------ !* 2. EAST-WEST DERIVATIVES ! --------------------- !* 2.1 U AND V. -IF(LUVDER)THEN - !$ACC PARALLEL LOOP PRIVATE(IR) DEFAULT(NONE) - DO JM=0,IMEN - DO JF=1,2*KF_UV - IR = ISTAGTF+2*JM+1 - PUVDERS(2*JF-1,IR) = -PUV(2*JF,IR)*ZACHTE2*REAL(JM,JPRBT) - PUVDERS(2*JF, IR) = PUV(2*JF-1,IR)*ZACHTE2*REAL(JM,JPRBT) +IF (ZGTF_START(ZGTF_START_INDEX_UVDERS) /= ZGTF_START(ZGTF_START_INDEX_UVDERS+1)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + DO KGL=IBEG,IEND,IINC + DO JF=ZGTF_START(ZGTF_START_INDEX_UVDERS),ZGTF_START(ZGTF_START_INDEX_UVDERS+1)-1 + + IGLG = D%NPTRLS(MYSETW)+KGL-1 + IMEN = G%NMEN(IGLG) + ISTAGTF = D%NSTAGTF(KGL) + ZACHTE2 = F%RACTHE(IGLG) + JF_UV = JF - ZGTF_START(ZGTF_START_INDEX_UVDERS) + ZGTF_START(ZGTF_START_INDEX_UV) + + !$ACC LOOP SEQ + DO JM=0,IMEN + IR = ISTAGTF+2*JM+1 + ZGTF(2*JF-1,IR) = -ZGTF(2*JF_UV,IR)*ZACHTE2*REAL(JM,JPRBT) + ZGTF(2*JF, IR) = ZGTF(2*JF_UV-1,IR)*ZACHTE2*REAL(JM,JPRBT) + ENDDO ENDDO ENDDO ENDIF !* 2.2 SCALAR VARIABLES - -IF(KF_SCDERS > 0)THEN - !$ACC PARALLEL LOOP PRIVATE(IR) DEFAULT(NONE) - DO JM=0,IMEN - DO JF=1,KF_SCALARS - IR = ISTAGTF+2*JM+1 - PEWDERS(2*JF-1,IR) = -PSCALAR(2*JF,IR)*ZACHTE2*REAL(JM,JPRBT) - PEWDERS(2*JF, IR) = PSCALAR(2*JF-1,IR)*ZACHTE2*REAL(JM,JPRBT) +IF (ZGTF_START(ZGTF_START_INDEX_EWDERS) /= ZGTF_START(ZGTF_START_INDEX_EWDERS+1)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + DO KGL=IBEG,IEND,IINC + DO JF=ZGTF_START(ZGTF_START_INDEX_EWDERS),ZGTF_START(ZGTF_START_INDEX_EWDERS+1)-1 + + IGLG = D%NPTRLS(MYSETW)+KGL-1 + IMEN = G%NMEN(IGLG) + ISTAGTF = D%NSTAGTF(KGL) + ZACHTE2 = F%RACTHE(IGLG) + JF_SCALAR = JF - ZGTF_START(ZGTF_START_INDEX_EWDERS) + ZGTF_START(ZGTF_START_INDEX_SCALAR) + + !$ACC LOOP SEQ + DO JM=0,IMEN + IR = ISTAGTF+2*JM+1 + ZGTF(2*JF-1,IR) = -ZGTF(2*JF_SCALAR,IR)*ZACHTE2*REAL(JM,JPRBT) + ZGTF(2*JF, IR) = ZGTF(2*JF_SCALAR-1,IR)*ZACHTE2*REAL(JM,JPRBT) + ENDDO ENDDO ENDDO ENDIF -enddo +!$ACC WAIT(1) + !$ACC END DATA ! ------------------------------------------------------------------ From 77ce5c297d01093fb61907c75eb10d44ceb8756b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:55 -0700 Subject: [PATCH 044/263] Adapt FSC Layout to what we have in FOURIER_IN --- src/trans/gpu/internal/fsc_mod.F90 | 62 +++++++++++++----------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 5c05695b0..df4b6dd29 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -49,9 +49,9 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT USE TPM_TRANS ,ONLY : LUVDER, LATLON, ZGTF, LVORGP, LDIVGP -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF USE TPM_FIELDS ,ONLY : F -USE TPM_GEOMETRY ,ONLY : G +USE TPM_GEOMETRY ,ONLY : G, G_NMEN USE TPM_FLT ,ONLY: S use tpm_gen, only: nout ! @@ -63,10 +63,10 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& REAL(KIND=JPRBT) :: ZACHTE,ZMUL, ZACHTE2, ZSHIFT, ZPI REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IMEN,ISTAGTF +INTEGER(KIND=JPIM) :: IOFF, OFFSET_VAR -INTEGER(KIND=JPIM) :: JF,IGLG,II,IR,JM +INTEGER(KIND=JPIM) :: JF,IGLG,II,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC !DEBUGGING: @@ -121,15 +121,15 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& ENDIF ZGTF_START(ZGTF_START_INDEX_END) = IST -!$ACC DATA PRESENT(ZGTF,D,G,F) COPYIN(ZGTF_START) +!$ACC DATA PRESENT(ZGTF,D,G,F,D_NSTAGTF,G_NMEN) COPYIN(ZGTF_START) IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" STOP 128 ! not implemented ENDIF - ! ------------------------------------------------------------------ - +OFFSET_VAR=D%NPTRLS(MYSETW) + !* 1. DIVIDE U V AND N-S DERIVATIVES BY A*COS(THETA) ! ---------------------------------------------- @@ -139,17 +139,15 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=ZGTF_START(ZGTF_START_INDEX_UV),ZGTF_START(ZGTF_START_INDEX_UV+1)-1 + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - IMEN = G%NMEN(IGLG) - ISTAGTF = D%NSTAGTF(KGL) ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ - DO JM=0,2*IMEN - IR = ISTAGTF+JM+1 - ZGTF(2*JF-1,IR) = ZGTF(2*JF-1,IR)*ZACHTE2 - ZGTF(2*JF, IR) = ZGTF(2*JF ,IR)*ZACHTE2 + DO JM=0,2*G_NMEN(IGLG) + ZGTF(2*JF-1,JM+IOFF) = ZGTF(2*JF-1,JM+IOFF)*ZACHTE2 + ZGTF(2*JF, JM+IOFF) = ZGTF(2*JF ,JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -161,17 +159,15 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=ZGTF_START(ZGTF_START_INDEX_NSDERS),ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1 + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - IMEN = G%NMEN(IGLG) - ISTAGTF = D%NSTAGTF(KGL) ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ - DO JM=0,2*IMEN - IR = ISTAGTF+JM+1 - ZGTF(2*JF-1,IR) = ZGTF(2*JF-1,IR)*ZACHTE2 - ZGTF(2*JF, IR) = ZGTF(2*JF, IR)*ZACHTE2 + DO JM=0,2*G_NMEN(IGLG) + ZGTF(2*JF-1,JM+IOFF) = ZGTF(2*JF-1,JM+IOFF)*ZACHTE2 + ZGTF(2*JF, JM+IOFF) = ZGTF(2*JF, JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -188,18 +184,16 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=ZGTF_START(ZGTF_START_INDEX_UVDERS),ZGTF_START(ZGTF_START_INDEX_UVDERS+1)-1 + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - IMEN = G%NMEN(IGLG) - ISTAGTF = D%NSTAGTF(KGL) ZACHTE2 = F%RACTHE(IGLG) JF_UV = JF - ZGTF_START(ZGTF_START_INDEX_UVDERS) + ZGTF_START(ZGTF_START_INDEX_UV) !$ACC LOOP SEQ - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - ZGTF(2*JF-1,IR) = -ZGTF(2*JF_UV,IR)*ZACHTE2*REAL(JM,JPRBT) - ZGTF(2*JF, IR) = ZGTF(2*JF_UV-1,IR)*ZACHTE2*REAL(JM,JPRBT) + DO JM=0,G_NMEN(IGLG) + ZGTF(2*JF-1,2*JM+IOFF) = -ZGTF(2*JF_UV,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF_UV-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO @@ -210,18 +204,16 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=ZGTF_START(ZGTF_START_INDEX_EWDERS),ZGTF_START(ZGTF_START_INDEX_EWDERS+1)-1 + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - IMEN = G%NMEN(IGLG) - ISTAGTF = D%NSTAGTF(KGL) ZACHTE2 = F%RACTHE(IGLG) JF_SCALAR = JF - ZGTF_START(ZGTF_START_INDEX_EWDERS) + ZGTF_START(ZGTF_START_INDEX_SCALAR) !$ACC LOOP SEQ - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - ZGTF(2*JF-1,IR) = -ZGTF(2*JF_SCALAR,IR)*ZACHTE2*REAL(JM,JPRBT) - ZGTF(2*JF, IR) = ZGTF(2*JF_SCALAR-1,IR)*ZACHTE2*REAL(JM,JPRBT) + DO JM=0,G_NMEN(IGLG) + ZGTF(2*JF-1,2*JM+IOFF) = -ZGTF(2*JF_SCALAR,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF_SCALAR-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO From db141f86ab8441ca032b51fda59b4fa4c3309c9c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:56 -0700 Subject: [PATCH 045/263] Avoid over-computation in FSC no need to recompute data that is not being read in FOURIER_IN --- src/trans/gpu/internal/fsc_mod.F90 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index df4b6dd29..be87ace8e 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -145,9 +145,9 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ - DO JM=0,2*G_NMEN(IGLG) - ZGTF(2*JF-1,JM+IOFF) = ZGTF(2*JF-1,JM+IOFF)*ZACHTE2 - ZGTF(2*JF, JM+IOFF) = ZGTF(2*JF ,JM+IOFF)*ZACHTE2 + DO JM=0,G_NMEN(IGLG) + ZGTF(2*JF-1,2*JM+IOFF) = ZGTF(2*JF-1,2*JM+IOFF)*ZACHTE2 + ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF ,2*JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -165,9 +165,9 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ - DO JM=0,2*G_NMEN(IGLG) - ZGTF(2*JF-1,JM+IOFF) = ZGTF(2*JF-1,JM+IOFF)*ZACHTE2 - ZGTF(2*JF, JM+IOFF) = ZGTF(2*JF, JM+IOFF)*ZACHTE2 + DO JM=0,G_NMEN(IGLG) + ZGTF(2*JF-1,2*JM+IOFF) = ZGTF(2*JF-1,2*JM+IOFF)*ZACHTE2 + ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF, 2*JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO From 9bf4f7b94a1c12cf19fb2668f6018c32b5fe1baf Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:56 -0700 Subject: [PATCH 046/263] Truncation is implicitly handled because we only fill the relevant data --- src/trans/gpu/internal/fourier_in_mod.F90 | 14 ++++++++++++-- src/trans/gpu/internal/ftinv_mod.F90 | 23 ----------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 430e82a6b..c47e4e6ac 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -69,9 +69,17 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) IINC=-1 ENDIF +!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL,D_NSTAGTF) ASYNC(1) + +! TODO: We don't need to zero out the full array here but we need to zero out because implicit +! truncation happens. We cannot rely on previous iterations that they had the same configuration. + +!$ACC KERNELS ASYNC(1) +PREEL(:,:) = 0 +!$ACC END KERNELS + OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL,D_NSTAGTF) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 @@ -89,6 +97,8 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) ENDDO !$ACC END DATA +!$ACC WAIT(1) + END SUBROUTINE FOURIER_IN END MODULE FOURIER_IN_MOD diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 2c7834d19..f03d39f49 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -90,29 +90,6 @@ SUBROUTINE FTINV(PREEL,KFIELDS) ISIZE=size(PREEL,1) -!$ACC DATA & -!$ACC& PRESENT(PREEL) - -!$ACC PARALLEL LOOP DEFAULT(NONE) -DO KGL=IBEG,IEND,IINC - - IOFF = D%NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - IST = 2*(G%NMEN(IGLG)+1) - ILEN = G%NLOEN(IGLG)+R%NNOEXTZL+2-IST - IST1=1 - IF (G%NLOEN(IGLG)==1) IST1=0 - - !$ACC loop collapse(2) - DO JJ=IST1,ILEN - DO JF=1,KFIELDS - PREEL(JF,IST+IOFF+JJ-1) = 0.0_JPRBT - ENDDO - ENDDO - -END DO -!$ACC end data - allocate(preel2(size(preel,1),size(preel,2))) !$acc data create(preel2) present(preel) From 92a8131f1cde8d2da55fbd4b2b601a368f6334ca Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:56 -0700 Subject: [PATCH 047/263] Move ZGTF_START_INDEX to tpm_fields and initialize in ftinv --- src/trans/gpu/internal/fsc_mod.F90 | 60 +++--------------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 77 +++++++++--------------- src/trans/gpu/internal/tpm_fields.F90 | 12 ++++ 3 files changed, 50 insertions(+), 99 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index be87ace8e..4ab621d8c 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -10,8 +10,7 @@ MODULE FSC_MOD CONTAINS -SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& - & KST_UV,KST_SC,KST_NSDERS,KST_EWDERS,KST_UVDERS) +SUBROUTINE FSC !**** *FSC - Division by a*cos(theta), east-west derivatives @@ -50,39 +49,21 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& USE TPM_TRANS ,ONLY : LUVDER, LATLON, ZGTF, LVORGP, LDIVGP USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF -USE TPM_FIELDS ,ONLY : F +USE TPM_FIELDS ,ONLY : F, ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & + & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR USE TPM_GEOMETRY ,ONLY : G, G_NMEN USE TPM_FLT ,ONLY: S use tpm_gen, only: nout ! IMPLICIT NONE -INTEGER(KIND=JPIM) :: KGL -INTEGER(KIND=JPIM) , INTENT(IN) :: KF_UV,KF_SCALARS,KF_SCDERS -INTEGER(KIND=JPIM) , INTENT(IN) :: KST_UV, KST_SC, KST_NSDERS, KST_EWDERS, KST_UVDERS - -REAL(KIND=JPRBT) :: ZACHTE,ZMUL, ZACHTE2, ZSHIFT, ZPI -REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IOFF, OFFSET_VAR - -INTEGER(KIND=JPIM) :: JF,IGLG,II,JM +INTEGER(KIND=JPIM) :: KGL +REAL(KIND=JPRBT) :: ZACHTE2 +INTEGER(KIND=JPIM) :: IOFF,OFFSET_VAR +INTEGER(KIND=JPIM) :: JF,IGLG,JM,JF_UV,JF_SCALAR INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -!DEBUGGING: -integer :: i,J,maxi,maxj,ist -real :: maxv - -INTEGER(JPIM) :: ZGTF_START(8) -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_VOR = 1 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_DIV = 2 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UV = 3 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_SCALAR = 4 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_NSDERS = 5 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UVDERS = 6 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_EWDERS = 7 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_END = 8 -INTEGER(JPIM) :: JF_UV, JF_SCALAR ! ------------------------------------------------------------------ @@ -96,38 +77,13 @@ SUBROUTINE FSC(KF_UV,KF_SCALARS,KF_SCDERS,& IINC=-1 ENDIF -IST = 1 -ZGTF_START(ZGTF_START_INDEX_VOR) = IST -IF (LVORGP) THEN - IST = IST+KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_DIV) = IST -IF (LDIVGP) THEN - IST = IST+KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_UV) = IST -IST = IST+2*KF_UV -ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST -IST = IST+KF_SCALARS -ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST -IST = IST+KF_SCDERS -ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST -IF (LUVDER) THEN - IST = IST+2*KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST -IF (KF_SCDERS > 0) THEN - IST = IST+KF_SCDERS -ENDIF -ZGTF_START(ZGTF_START_INDEX_END) = IST - !$ACC DATA PRESENT(ZGTF,D,G,F,D_NSTAGTF,G_NMEN) COPYIN(ZGTF_START) IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" STOP 128 ! not implemented ENDIF - + OFFSET_VAR=D%NPTRLS(MYSETW) !* 1. DIVIDE U V AND N-S DERIVATIVES BY A*COS(THETA) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index d8c087f0d..8cb59c4ad 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -72,6 +73,9 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& USE FTINV_MOD ,ONLY : FTINV USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & + & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & + & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END use ieee_arithmetic ! @@ -102,25 +106,9 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC - -REAL(KIND=JPRBT),POINTER :: ZUV(:,:) -REAL(KIND=JPRBT),POINTER :: ZSCALAR(:,:) -REAL(KIND=JPRBT),POINTER :: ZNSDERS(:,:) -REAL(KIND=JPRBT),POINTER :: ZEWDERS(:,:) -REAL(KIND=JPRBT),POINTER :: ZUVDERS(:,:) -#if 0 -REAL(KIND=JPRBT),TARGET :: ZDUM(1,D%NLENGTF) ! Reducing stack usage here, too -#else -REAL(KIND=JPRBT),TARGET,ALLOCATABLE :: ZDUM(:,:) ! When using this (HEAP) alloc Cray CCE 8.6.2 fails in OMP 1639 -#endif -INTEGER(KIND=JPIM) :: ist_uv, ist_sc, ist_nsders, ist_uvders, ist_ewders, JF_FS -ist_uv = 1 -ist_sc = 1 -ist_nsders = 1 -ist_uvders = 1 -ist_ewders = 1 +INTEGER(KIND=JPIM) :: JF_FS + ! ------------------------------------------------------------------ @@ -128,37 +116,32 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(107,0) -IF (KF_UV > 0 .OR. KF_SCDERS > 0 .OR. (LATLON.AND.S%LDLL) ) THEN - IST = 1 - IF (LVORGP) THEN - IST = IST+KF_UV - ENDIF - IF (LDIVGP) THEN - IST = IST+KF_UV - ENDIF - IST_UV = IST +! Figure out where we want to store data in ZGTF +IST = 1 +ZGTF_START(ZGTF_START_INDEX_VOR) = IST +IF (LVORGP) THEN + IST = IST+KF_UV +ENDIF +ZGTF_START(ZGTF_START_INDEX_DIV) = IST +IF (LDIVGP) THEN + IST = IST+KF_UV +ENDIF +ZGTF_START(ZGTF_START_INDEX_UV) = IST +IST = IST+2*KF_UV +ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST +IST = IST+KF_SCALARS +ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST +IST = IST+KF_SCDERS +ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST +IF (LUVDER) THEN IST = IST+2*KF_UV - IST_SC = IST - IST = IST+KF_SCALARS - IST_NSDERS = IST - IST = IST+KF_SCDERS - IF (LUVDER) THEN - IST_UVDERS = IST - IST = IST+2*KF_UV - ENDIF - IF (KF_SCDERS > 0) THEN - IST_EWDERS = IST - ENDIF ENDIF -IF (MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 +ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST +IF (KF_SCDERS > 0) THEN + IST = IST+KF_SCDERS ENDIF +ZGTF_START(ZGTF_START_INDEX_END) = IST + CALL GSTATS(1639,0) ! from FOUBUF to ZGTF @@ -167,7 +150,7 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& ! 2. Fourier space computations IF(KF_UV > 0 .OR. KF_SCDERS > 0) THEN - CALL FSC(KF_UV,KF_SCALARS,KF_SCDERS,IST_UV,IST_SC,IST_NSDERS,IST_EWDERS,IST_UVDERS) + CALL FSC ENDIF ! 3. Fourier transform diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index 606e5862f..6781c0def 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -115,4 +116,15 @@ MODULE TPM_FIELDS REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA1(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA2(:,:,:) +! TODO find a better place for this +INTEGER(JPIM) :: ZGTF_START(8) +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_VOR = 1 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_DIV = 2 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UV = 3 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_SCALAR = 4 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_NSDERS = 5 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UVDERS = 6 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_EWDERS = 7 +INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_END = 8 + END MODULE TPM_FIELDS From 9217c285e704d7a38c49f74e700a0294ef5cc96c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:56 -0700 Subject: [PATCH 048/263] Compute only the INVFFTs that are actually needed --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 11 +++++------ src/trans/gpu/internal/ftinv_mod.F90 | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 8cb59c4ad..8ea5dbfe0 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -145,18 +145,17 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(1639,0) ! from FOUBUF to ZGTF -CALL FOURIER_IN(ZGTF,KF_OUT_LT) +CALL FOURIER_IN(ZGTF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) ! 2. Fourier space computations -IF(KF_UV > 0 .OR. KF_SCDERS > 0) THEN - CALL FSC -ENDIF +! fills rest of data up to ZGTF_START_INDEX_UVDERS +CALL FSC ! 3. Fourier transform +! from ZGTF to ZGTF IF(KF_FS > 0) THEN - ! from ZGTF to ZGTF - CALL FTINV(ZGTF,size(zgtf,1)) + CALL FTINV(ZGTF,size(zgtf,1),ZGTF_START(ZGTF_START_INDEX_UVDERS+1)-1) ENDIF CALL GSTATS(1639,1) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index f03d39f49..d9c654331 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -10,7 +10,7 @@ MODULE FTINV_MOD CONTAINS -SUBROUTINE FTINV(PREEL,KFIELDS) +SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) !**** *FTINV - Inverse Fourier transform @@ -61,7 +61,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS, STRIDE INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(:,:) @@ -107,7 +107,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) !IF (G%NLOEN(IGLG)>1) THEN !call cudaProfilerStop() !istat=cuda_SetDevice(idev) - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELDS,KFIELDS) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),2*KFIELDS,STRIDE) !$ACC host_data use_device(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1, ioff),PREEL2(1, ioff)) !$ACC end host_data From 5df2a7b2bd6c758e7f2a56b18f9ddf62f664071a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:57 -0700 Subject: [PATCH 049/263] Cleanup FTINV_MOD.F90 --- src/trans/gpu/internal/ftinv_mod.F90 | 39 +++++++--------------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index d9c654331..297fba3d0 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -49,28 +49,18 @@ SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G -use tpm_gen, only: nout -USE TPM_FFT ,ONLY : T -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif -USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, destroy_plan_fft -USE TPM_DIM ,ONLY : R +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS, STRIDE -INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 -INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE -LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time +INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL INTEGER(KIND=JPIM) :: IPLAN_C2R -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,ISIZE -integer :: istat,idev +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT), allocatable :: PREEL2(:,:) @@ -88,8 +78,6 @@ SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) IINC=-1 ENDIF -ISIZE=size(PREEL,1) - allocate(preel2(size(preel,1),size(preel,2))) !$acc data create(preel2) present(preel) @@ -98,31 +86,22 @@ SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) ENDIF CALL GSTATS(451,0) -!istat = cuda_GetDevice(idev) -!istat = cuda_Synchronize() -!!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(istat,KGL,IOFF,IGLG,IPLAN_C2R) DO KGL=IBEG,IEND,IINC + IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - !IF (G%NLOEN(IGLG)>1) THEN -!call cudaProfilerStop() - !istat=cuda_SetDevice(idev) - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),2*KFIELDS,STRIDE) - !$ACC host_data use_device(PREEL,PREEL2) - CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1, ioff),PREEL2(1, ioff)) - !$ACC end host_data -!call cudaProfilerStart() - !ENDIF + + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),2*KFIELDS,STRIDE) + !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) + CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) + !$ACC END HOST_DATA END DO -!!$OMP END PARALLEL DO -istat = cuda_Synchronize() IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') ENDIF CALL GSTATS(451,1) - !$acc kernels preel(:,:) = preel2(:,:) !$acc end kernels From 60a0e8ef4a31d2160b4ad2fc70aa84ec5d5e6c40 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:57 -0700 Subject: [PATCH 050/263] Changes for TRMTOL - TRMTOL call takes ZGTF_START now - trltom and trmtol do exactly the same now --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 31 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 31 +- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 8 +- src/trans/gpu/internal/trltom_mod.F90 | 515 +++++++++---------- src/trans/gpu/internal/trmtol_mod.F90 | 159 ++---- 5 files changed, 324 insertions(+), 420 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 8ea5dbfe0..e68c1853c 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -73,9 +73,7 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& USE FTINV_MOD ,ONLY : FTINV USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & - & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & - & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END +USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_NSDERS, ZGTF_START_INDEX_UVDERS use ieee_arithmetic ! @@ -116,33 +114,6 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(107,0) -! Figure out where we want to store data in ZGTF -IST = 1 -ZGTF_START(ZGTF_START_INDEX_VOR) = IST -IF (LVORGP) THEN - IST = IST+KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_DIV) = IST -IF (LDIVGP) THEN - IST = IST+KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_UV) = IST -IST = IST+2*KF_UV -ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST -IST = IST+KF_SCALARS -ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST -IST = IST+KF_SCDERS -ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST -IF (LUVDER) THEN - IST = IST+2*KF_UV -ENDIF -ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST -IF (KF_SCDERS > 0) THEN - IST = IST+KF_SCDERS -ENDIF -ZGTF_START(ZGTF_START_INDEX_END) = IST - - CALL GSTATS(1639,0) ! from FOUBUF to ZGTF CALL FOURIER_IN(ZGTF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index b3a49856d..6190db0c8 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2001- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -87,6 +88,9 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TPM_GEN ,ONLY : NPROMATR USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, FOUBUF +USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & + & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & + & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END !USE TPM_DISTR USE SHUFFLE_MOD ,ONLY : SHUFFLE @@ -135,7 +139,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_SCDERS,IF_OUT_LT -INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB +INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB,IST ! ------------------------------------------------------------------ @@ -143,6 +147,8 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF_GPB = 2*KF_UV_G+KF_SCALARS_G IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN + print *, "This is currently not supported and/or tested" + stop 24 ! Fields to be split into packets @@ -278,6 +284,29 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ELSE call nvtxStartRange("INVTRANS") + ! Figure out where we want to store data in ZGTF + IST = 1 + ZGTF_START(ZGTF_START_INDEX_VOR) = IST + IF (LVORGP) THEN + IST = IST+KF_UV + ENDIF + ZGTF_START(ZGTF_START_INDEX_DIV) = IST + IF (LDIVGP) THEN + IST = IST+KF_UV + ENDIF + ZGTF_START(ZGTF_START_INDEX_UV) = IST + IST = IST+2*KF_UV + ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST + IST = IST+KF_SCALARS + ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST + IST = IST+KF_SCDERS + ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST + IF (LUVDER) THEN + IST = IST+2*KF_UV + ENDIF + ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST + IST = IST+KF_SCDERS + ZGTF_START(ZGTF_START_INDEX_END) = IST !$ACC DATA CREATE(FOUBUF) ! No splitting of fields, transform done in one go diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 1a0712d11..7cb488521 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -59,6 +60,7 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G + USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_NSDERS USE TPM_FLT @@ -106,11 +108,9 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& ! from FOUBUF_IN to FOUBUF #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,2*KF_OUT_LT) + CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) #else - !$ACC UPDATE HOST(FOUBUF_IN) - CALL TRMTOL(FOUBUF_IN,FOUBUF,2*KF_OUT_LT) - !$ACC UPDATE DEVICE(FOUBUF) + CALL TRMTOL(FOUBUF_IN,FOUBUF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) #endif CALL GSTATS(152,1) !$ACC END DATA diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 2229dc345..aedfa79c5 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -9,295 +9,250 @@ ! MODULE TRLTOM_MOD - CONTAINS - SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) - - !**** *TRLTOM * - transposition in Fourierspace - - ! Purpose. - ! -------- - ! Transpose Fourier coefficients from partitioning - ! over latitudes to partitioning over wave numbers - ! This is done between inverse Legendre Transform - ! and inverse FFT. - ! This is the inverse routine of TRMTOL. - - !** Interface. - ! ---------- - ! *CALL* *TRLTOM(...)* - - ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is - ! -------------------- used for both input and output. - - ! KF_FS - Number of fields communicated - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use - ! (NCOMBFLEN) for nphase.eq.1 - ! Modified : 99-05-28 D.Salmond - Optimise copies. - ! Modified : 00-02-02 M.Hamrud - Remove NPHASE - ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message - ! passing and buffer packing - ! G.Mozdzynski : 08-01-01 Cleanup - ! Y.Seity : 07-08-30 Add barrier synchonisation under LSYNC_TRANS - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK, MPL_WAIT, JP_NON_BLOCKING_STANDARD - - USE TPM_DISTR ,ONLY : D, MTAGLM, MYSETW, NPRTRW, NPROC, MYPROC - USE TPM_GEN ,ONLY : LSYNC_TRANS - - USE MPI - - !USE SET2PE_MOD - !USE MYSENDSET_MOD - !USE MYRECVSET_MOD - !USE ABORT_TRANS_MOD - ! - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS - REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) - REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) - - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - - INTEGER(KIND=JPIM) :: ITAG, J, ILEN, ISTA - + +CONTAINS +SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) + +!**** *TRLTOM * - transposition in Fourierspace + +! Purpose. +! -------- +! Transpose Fourier coefficients from partitioning +! over latitudes to partitioning over wave numbers +! This is done between inverse Legendre Transform +! and inverse FFT. +! This is the inverse routine of TRMTOL. + +!** Interface. +! ---------- +! *CALL* *TRLTOM(...)* + +! Explicit arguments : PFBUF - Fourier coefficient buffer. It is +! -------------------- used for both input and output. + +! KF_FS - Number of fields communicated + +! Implicit arguments : +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use +! (NCOMBFLEN) for nphase.eq.1 +! Modified : 99-05-28 D.Salmond - Optimise copies. +! Modified : 00-02-02 M.Hamrud - Remove NPHASE +! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message +! passing and buffer packing +! G.Mozdzynski: 08-01-01 Cleanup +! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK +USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW +USE TPM_GEN ,ONLY : LSYNC_TRANS +USE MPI + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) + +INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) +INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR2 - - REAL(KIND=JPRBT) :: ZDUM(1) - INTEGER(KIND=JPIM) :: IREQ - INTEGER(KIND=JPIM) :: IERROR - ! ------------------------------------------------------------------ - - REAL(KIND=JPRBT) :: T1, T2, TIMEF, Tc - INTEGER(KIND=JPIM) :: MTOL_OR_LTOM, NOFULLPEERACCESS - INTEGER(KIND=JPIM) :: IRANK,IUNIT - INTEGER(KIND=JPIM) :: FROM_SEND,FROM_RECV,TO_RECV,TO_SEND - - - IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) +INTEGER(KIND=JPIM) :: IERROR #ifdef PARKINDTRANS_SINGLE #define TRLTOM_DTYPE MPI_REAL #else #define TRLTOM_DTYPE MPI_DOUBLE_PRECISION #endif - - ITAG = MTAGLM - - IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - - CALL GSTATS(806,0) - - ! copy to self workaround - IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) - IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN - PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) - stop 1 - ENDIF - IF (ILENS(IRANK) > 0) THEN - FROM_SEND = IOFFS(IRANK) + 1 - TO_SEND = FROM_SEND + ILENS(IRANK) - 1 - FROM_RECV = IOFFR(IRANK) + 1 - TO_RECV = FROM_RECV + ILENR(IRANK) - 1 - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) - !$ACC END KERNELS - ILENS(IRANK) = 0 - ILENR(IRANK) = 0 - ENDIF - - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - - IF (LSYNC_TRANS) THEN - CALL GSTATS(420,0) - CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') - CALL GSTATS(420,1) - ENDIF - CALL GSTATS(411,0) - CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& - & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & - & MPL_ALL_MS_COMM, IERROR) - - !$ACC END HOST_DATA - !$ACC WAIT(1) +IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) + +IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + + CALL GSTATS(806,0) + + ! copy to self workaround + IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF + IF (ILENS(IRANK) > 0) THEN + FROM_SEND = IOFFS(IRANK) + 1 + TO_SEND = FROM_SEND + ILENS(IRANK) - 1 + FROM_RECV = IOFFR(IRANK) + 1 + TO_RECV = FROM_RECV + ILENR(IRANK) - 1 + !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) + !$ACC END KERNELS + ILENS(IRANK) = 0 + ILENR(IRANK) = 0 + ENDIF + + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(420,0) + CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') + CALL GSTATS(420,1) + ENDIF + CALL GSTATS(411,0) + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& + & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & + & MPL_ALL_MS_COMM,IERROR) + + !$ACC END HOST_DATA + !$ACC WAIT(1) + + IF (LSYNC_TRANS) THEN CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) - CALL GSTATS(411,1) - CALL GSTATS(806,1) - ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 - CALL GSTATS(1607,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1607,1) ENDIF - - IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ - END SUBROUTINE TRLTOM_CUDAAWARE - - SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) - - !**** *TRLTOM * - transposition in Fourierspace - - ! Purpose. - ! -------- - ! Transpose Fourier coefficients from partitioning - ! over latitudes to partitioning over wave numbers - ! This is done between inverse Legendre Transform - ! and inverse FFT. - ! This is the inverse routine of TRMTOL. - - !** Interface. - ! ---------- - ! *CALL* *TRLTOM(...)* - - ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is - ! -------------------- used for both input and output. - - ! KF_FS - Number of fields communicated - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use - ! (NCOMBFLEN) for nphase.eq.1 - ! Modified : 99-05-28 D.Salmond - Optimise copies. - ! Modified : 00-02-02 M.Hamrud - Remove NPHASE - ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message - ! passing and buffer packing - ! G.Mozdzynski : 08-01-01 Cleanup - ! Y.Seity : 07-08-30 Add barrier synchonisation under LSYNC_TRANS - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK, MPL_WAIT, JP_NON_BLOCKING_STANDARD - - USE TPM_DISTR ,ONLY : D, MTAGLM, MYSETW, NPRTRW, NPROC, MYPROC - USE TPM_GEN ,ONLY : LSYNC_TRANS - - USE MPI - - !USE SET2PE_MOD - !USE MYSENDSET_MOD - !USE MYRECVSET_MOD - !USE ABORT_TRANS_MOD - ! - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS - REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) - REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) - - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - - INTEGER(KIND=JPIM) :: ITAG, J, ILEN, ISTA - + CALL GSTATS(411,1) + CALL GSTATS(806,1) +ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + CALL GSTATS(1607,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1607,1) +ENDIF + +IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ +END SUBROUTINE TRLTOM_CUDAAWARE + +SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) + +!**** *TRLTOM * - transposition in Fourier space + +! Purpose. +! -------- +! Transpose Fourier coefficients from partitioning +! over latitudes to partitioning over wave numbers +! This is done between inverse Legendre Transform +! and inverse FFT. +! This is the inverse routine of TRMTOL. + +!** Interface. +! ---------- +! *CALL* *TRLTOM(...)* + +! Explicit arguments : PFBUF - Fourier coefficient buffer. It is +! -------------------- used for both input and output. +! KF_FS - Number of fields communicated + +! Implicit arguments : +! -------------------- + +! Method. +! ------- +! See documentation + +! Externals. +! ---------- + +! Reference. +! ---------- +! ECMWF Research Department documentation of the IFS + +! Author. +! ------- +! MPP Group *ECMWF* + +! Modifications. +! -------------- +! Original : 95-10-01 +! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use +! (NCOMBFLEN) for nphase.eq.1 +! Modified : 99-05-28 D.Salmond - Optimise copies. +! Modified : 00-02-02 M.Hamrud - Remove NPHASE +! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message +! passing and buffer packing +! G.Mozdzynski : 08-01-01 Cleanup +! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM +USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW +USE TPM_GEN ,ONLY : LSYNC_TRANS +USE MPI + +IMPLICIT NONE + +INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) + +INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) +INTEGER(KIND=JPIM) :: J, ILEN, ISTA REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR2 - - REAL(KIND=JPRBT) :: ZDUM(1) - INTEGER(KIND=JPIM) :: IREQ - INTEGER(KIND=JPIM) :: IERROR - ! ------------------------------------------------------------------ - - REAL(KIND=JPRBT) :: T1, T2, TIMEF, tc - INTEGER(KIND=JPIM) :: MTOL_OR_LTOM, NOFULLPEERACCESS - INTEGER(KIND=JPIM) :: IRANK,iunit - - IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) - - ITAG = MTAGLM - - IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - - CALL GSTATS(806,0) - !$ACC UPDATE HOST(PFBUF_IN) - - CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& - & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& - & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRLTOM:') - - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - !$ACC UPDATE DEVICE(PFBUF) +INTEGER(KIND=JPIM) :: IERROR + +IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) + +IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + + CALL GSTATS(806,0) + !$ACC UPDATE HOST(PFBUF_IN) + + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& + & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& + & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRLTOM:') + + !$ACC UPDATE DEVICE(PFBUF) + CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) CALL GSTATS(806,1) - ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 - CALL GSTATS(1607,0) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1607,1) - ENDIF - - IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ - END SUBROUTINE TRLTOM - END MODULE TRLTOM_MOD +ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + CALL GSTATS(1607,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1607,1) +ENDIF + +IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) +! ------------------------------------------------------------------ +END SUBROUTINE TRLTOM +END MODULE TRLTOM_MOD diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 419cd747c..0a7f6782d 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -60,38 +60,26 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message ! passing and buffer packing ! G.Mozdzynski: 08-01-01 Cleanup -! Y.Seity : 07-08-31 add barrien synchronisation under LSYNC_TRANS +! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_MYRANK - -USE TPM_DISTR ,ONLY : D, MTAGML, MYSETW, NPRTRW, NPROC, MYPROC +USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK +USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS - USE MPI IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - -INTEGER(KIND=JPIM) :: ITAG, J, ILEN, ISTA - -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR2 - -REAL(KIND=JPRBT) :: ZDUM(1) -INTEGER(KIND=JPIM) :: IREQ, IERROR, IRANK -INTEGER(KIND=JPIM) :: FROM_SEND,FROM_RECV,TO_RECV,TO_SEND +INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK +REAL(KIND=JPHOOK) :: ZHOOK_HANDLE +INTEGER(KIND=JPIM) :: IERROR #ifdef PARKINDTRANS_SINGLE #define TRMTOL_DTYPE MPI_REAL @@ -99,29 +87,24 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) #define TRMTOL_DTYPE MPI_DOUBLE_PRECISION #endif -! ------------------------------------------------------------------ - IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) -ITAG = MTAGML - -DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*KFIELD - ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*KFIELD -ENDDO - -!write(300+myproc,*)"0:TRMTOL:PFBUF",sum(PFBUF) -!write(300+myproc,*)"0:TRMTOL:PFBUF_IN",sum(PFBUF_IN) -!call flush(300) IF(NPROC > 1) THEN - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',0,ZHOOK_HANDLE_BAR) - ! CALL GSTATS_BARRIER(764) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',1,ZHOOK_HANDLE_BAR) + DO J=1,NPRTRW + ILENS(J) = D%NLTSFTB(J)*2*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*2*KFIELD + ILENR(J) = D%NLTSGTB(J)*2*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*2*KFIELD + ENDDO + + CALL GSTATS(807,0) ! copy to self workaround IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF IF (ILENS(IRANK) > 0) THEN FROM_SEND = IOFFS(IRANK) + 1 TO_SEND = FROM_SEND + ILENS(IRANK) - 1 @@ -134,9 +117,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ILENR(IRANK) = 0 ENDIF - !$ACC DATA PRESENT(PFBUF_IN, PFBUF) !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - CALL GSTATS(807,0) + IF (LSYNC_TRANS) THEN CALL GSTATS(421,0) CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') @@ -148,17 +130,16 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) & MPL_ALL_MS_COMM,IERROR) !$ACC END HOST_DATA - !$ACC END DATA - CALL GSTATS(807,1) !$ACC WAIT(1) - CALL GSTATS(410,1) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR2',0,ZHOOK_HANDLE_BAR2) - !CALL GSTATS_BARRIER2(764) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR2',1,ZHOOK_HANDLE_BAR2) + IF (LSYNC_TRANS) THEN + CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) + ENDIF + CALL GSTATS(410,1) + CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*2*KFIELD + ISTA = D%NSTAGT0B(MYSETW)*2*KFIELD+1 CALL GSTATS(1608,0) !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 @@ -166,19 +147,15 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ENDDO CALL GSTATS(1608,1) ENDIF -!write(300+myproc,*)"10:TRMTOL:PFBUF",sum(PFBUF), KFIELD, D%NLTSGTB(MYSETW), MYSETW, D%NSTAGT0B(MYSETW), D%NLTSFTB(MYSETW), D%MSTABF(MYSETW) -!write(300+myproc,*)"10:TRMTOL:PFBUF_IN",sum(PFBUF_IN) -!call flush(300+myproc) IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ - END SUBROUTINE TRMTOL_CUDAAWARE SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) -!**** *trmtol * - transposition in Fourier space +!**** *TRMTOL * - transposition in Fourier space ! Purpose. ! -------- @@ -191,7 +168,7 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) !** Interface. ! ---------- -! *call* *trmtol(...)* +! *CALL* *TRMTOL(...)* ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. @@ -225,87 +202,59 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message ! passing and buffer packing ! G.Mozdzynski: 08-01-01 Cleanup -! Y.Seity : 07-08-31 add barrien synchronisation under LSYNC_TRANS +! Y.Seity : 07-08-31 Add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_MYRANK - -USE TPM_DISTR ,ONLY : D, MTAGML, MYSETW, NPRTRW, NPROC, MYPROC +USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM +USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS - +USE MPI IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - -INTEGER(KIND=JPIM) :: ITAG, J, ILEN, ISTA - +INTEGER(KIND=JPIM) :: J, ILEN, ISTA REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR2 - -REAL(KIND=JPRBT) :: ZDUM(1) -INTEGER(KIND=JPIM) :: IREQ - - -! ------------------------------------------------------------------ +INTEGER(KIND=JPIM) :: IERROR IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) - -ITAG = MTAGML - -DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*KFIELD - ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*KFIELD -ENDDO - -!write(300+myproc,*)"0:TRMTOL:PFBUF",sum(PFBUF) -!write(300+myproc,*)"0:TRMTOL:PFBUF_IN",sum(PFBUF_IN) -!call flush(300) IF(NPROC > 1) THEN - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',0,ZHOOK_HANDLE_BAR) - ! CALL GSTATS_BARRIER(764) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR',1,ZHOOK_HANDLE_BAR) -! IF (LSYNC_TRANS) THEN -! CALL MPL_BARRIER(CDSTRING='TRMTOL') -! ENDIF + DO J=1,NPRTRW + ILENS(J) = D%NLTSFTB(J)*2*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*2*KFIELD + ILENR(J) = D%NLTSGTB(J)*2*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*2*KFIELD + ENDDO CALL GSTATS(807,0) + !$ACC UPDATE HOST(PFBUF_IN) + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRMTOL:') + + !$ACC UPDATE DEVICE(PFBUF) + CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) CALL GSTATS(807,1) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR2',0,ZHOOK_HANDLE_BAR2) - !CALL GSTATS_BARRIER2(764) - IF (LHOOK) CALL DR_HOOK('TRMTOL_BAR2',1,ZHOOK_HANDLE_BAR2) ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*2*KFIELD + ISTA = D%NSTAGT0B(MYSETW)*2*KFIELD+1 CALL GSTATS(1608,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 PFBUF(J) = PFBUF_IN(J) ENDDO CALL GSTATS(1608,1) ENDIF -!write(300+myproc,*)"10:TRMTOL:PFBUF",sum(PFBUF), KFIELD, D%NLTSGTB(MYSETW), MYSETW, D%NSTAGT0B(MYSETW), D%NLTSFTB(MYSETW), D%MSTABF(MYSETW) -!write(300+myproc,*)"10:TRMTOL:PFBUF_IN",sum(PFBUF_IN) -!call flush(300+myproc) IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ - END SUBROUTINE TRMTOL END MODULE TRMTOL_MOD From 5326ba67b19a68c3b71402baf9dffcd8a7ce3345 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:57 -0700 Subject: [PATCH 051/263] Apply the usual kernel pattern to leinv --- src/trans/gpu/internal/leinv_mod.F90 | 178 ++++++++++----------------- 1 file changed, 67 insertions(+), 111 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 407077a07..2c5ca7dd2 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -89,7 +89,6 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! LOCAL INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J1, IF, JGL,JK, J,JI, IRET -INTEGER(KIND=JPIM) :: ITHRESHOLD REAL(KIND=JPRBT), ALLOCATABLE :: ZZBS(:,:,:), ZZCSTS(:,:,:) REAL(KIND=JPRBT), ALLOCATABLE :: ZZBA(:,:,:), ZZCSTA(:,:,:) @@ -117,12 +116,11 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ALLOCATE(ZZCSTS(ITDZCS,ILDZCS,D_NUMP)) !$ACC DATA CREATE(ZZBA,ZZCSTA,ZZBS,ZZCSTS) -!$ACC DATA COPYIN (S,S%ITHRESHOLD,S%LUSEFLT) & -!$ACC& COPYIN (D,D_MYMS,R,G,G_NDGLU,D_NUMP,R_NDGNH,R_NSMAX) & -!$ACC& PRESENT (ZAA,ZAS) & -!$ACC& PRESENT (ZZBA,ZZBS,ZZCSTA,ZZCSTS) & -!$ACC& PRESENT (PIA) & -!$ACC& PRESENT (PSOA1,PAOA1,IZBS) +!$ACC DATA COPYIN (D,D_MYMS,R,G,G_NDGLU,D_NUMP,R_NDGNH,R_NSMAX) & +!$ACC& PRESENT (ZAA,ZAS) & +!$ACC& PRESENT (ZZBA,ZZBS,ZZCSTA,ZZCSTS) & +!$ACC& PRESENT (PIA) & +!$ACC& PRESENT (PSOA1,PAOA1,IZBS) !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -139,54 +137,26 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !end loop over wavenumber END DO -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISKIP,ILA,ILS,IA) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,(R_NSMAX+2)/2 - DO JK=1,KFC - - KM = D_MYMS(KMLOC) - IF (KM == 0) THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILA = (R_NSMAX-KM+2)/2 - IF (J .LE. ILA) THEN - IA = 1+MOD(R_NSMAX-KM+2,2) -!! IZBA((JK-1)/ISKIP+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC)*RRPELTMINV/ZAMAX((JK-1)/ISKIP+1,KMLOC) - IZBS((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*TDZAA)*IF_FS_INV)=PIA(JK,IA+1+(J-1)*2,KMLOC) - ENDIF - ENDIF - ENDDO - ENDDO -ENDDO - -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISKIP,ILA,ILS,IA) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO J=1,(R_NSMAX+2)/2 - DO JK=1,KFC - KM = D_MYMS(KMLOC) - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILA = (R_NSMAX-KM+2)/2 - IF (J .LE. ILA) THEN - IA = 1+MOD(R_NSMAX-KM+2,2) - ZZBA((JK-1)/ISKIP+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) - ENDIF - ENDIF - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + IF(KM /= 0)THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+2)/2 + IA = 1+MOD(R_NSMAX-KM+2,2) + ZZBA((JK-1)+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+2)/2 + IA = 1+MOD(R_NSMAX+2,2) + ZZBA((JK-1)/2+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ENDIF ENDDO ENDDO -ITHRESHOLD=S%ITHRESHOLD - ! operate on full arrays, where non-relavent entries have been set to zero ! call CUDA_DGEMM_BATCHED('N','N',LDZAA,TDZBA,TDZAA,1.0_JPRB,ZAA,LDZAA,TDZAA,ZBA,LDZBA,TDZBA,0._JPRB,ZCA,LDZCA,TDZCA,D_NUMP) ! Get C in transpose format to get better memory access patterns later @@ -207,49 +177,42 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,KDGLU,ISKIP,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,KDGLU) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JI=1,R_NDGNH - DO JK=1,KFC - KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - IF (JI .LE. KDGLU) then - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - END IF - - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - PAOA1(JK,ISL+JI-1,KMLOC) = ZZCSTA((JK-1)/ISKIP+1,JI,KMLOC) - END IF - END IF - ENDDO - ENDDO -END DO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH + IF(KM /= 0) THEN + PAOA1(JK,JGL,KMLOC) = ZZCSTA((JK-1)+1,JGL-ISL+1,KMLOC) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + PAOA1(JK,JGL,KMLOC) = ZZCSTA((JK-1)/2+1,JGL-ISL+1,KMLOC) + ENDIF + ENDDO + ENDDO +ENDDO ! 2. +++++++++++++ symmetric -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,ISKIP,ILS,IS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,(R_NSMAX+3)/2 - DO JK=1,KFC - KM = D_MYMS(KMLOC) - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - ENDIF - - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - ILS = (R_NSMAX-KM+3)/2 - IF (J .LE. ILS) THEN - IS = 1+MOD(R_NSMAX-KM+1,2) - ZZBS((JK-1)/ISKIP+1,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) - END IF - END IF - ENDDO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + IF(KM /= 0)THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+3)/2 + IS = 1+MOD(R_NSMAX-KM+1,2) + ZZBS((JK-1)+1,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+3)/2 + IS = 1+MOD(R_NSMAX+1,2) + ZZBS((JK-1)/2+1,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO + ENDIF ENDDO ENDDO @@ -268,29 +231,22 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) & D_NUMP) !$ACC END HOST_DATA - -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,KDGLU,ISKIP,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,KDGLU) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JI=1,R_NDGNH - DO JK=1,KFC - KM = D_MYMS(KMLOC) - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) - IF (JI .LE. KDGLU) then - IF(KM == 0)THEN - ISKIP = 2 - ELSE - ISKIP = 1 - END IF - - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - IF (MOD((JK-1),ISKIP) .EQ. 0) THEN - !PSOA1(JK,ISL+JI-1,KMLOC) = IZCST((JK-1)/ISKIP+1+(JI-1+(KMLOC-1)*R_NDGNH)*IF_FS_INV) - PSOA1(JK,ISL+JI-1,KMLOC) = ZZCSTS((JK-1)/ISKIP+1,JI,KMLOC) - END IF - END IF - ENDDO - ENDDO -END DO + DO JK=1,KFC + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) + !$ACC LOOP SEQ + DO JGL=ISL,R_NDGNH + IF(KM /= 0) THEN + PSOA1(JK,JGL,KMLOC) = ZZCSTS((JK-1)+1,JGL-ISL+1,KMLOC) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + PSOA1(JK,JGL,KMLOC) = ZZCSTS((JK-1)/2+1,JGL-ISL+1,KMLOC) + ENDIF + ENDDO + ENDDO +ENDDO !$ACC END DATA From 6a4652c5acf0cd6697e248944b441767b9eb8a95 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:57 -0700 Subject: [PATCH 052/263] simplify recombination inv le --- src/trans/gpu/internal/asre1b_mod.F90 | 35 +++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/trans/gpu/internal/asre1b_mod.F90 b/src/trans/gpu/internal/asre1b_mod.F90 index 585036e6e..9ef494c95 100755 --- a/src/trans/gpu/internal/asre1b_mod.F90 +++ b/src/trans/gpu/internal/asre1b_mod.F90 @@ -83,25 +83,24 @@ SUBROUTINE ASRE1B(KFIELD,PAOA,PSOA) ! --------------------------------------------------- !$ACC DATA PRESENT(PAOA,PSOA,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IPROC,ISTAN,IGLS,IPROCS,ISTAS) +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IPROC,ISTAN,IGLS,IPROCS,ISTAS) DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - DO JGL=ISL, R_NDGNH -! if (JGL .ge. ISL) then - IPROC = D_NPROCL(JGL) - ISTAN = (D_NSTAGT1B(IPROC) + D_NPNTGTB1(KMLOC,JGL))*2*KFIELD - IGLS = R_NDGL+1-JGL - IPROCS = D_NPROCL(IGLS) - ISTAS = (D_NSTAGT1B(IPROCS) + D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD - DO JFLD=1,2*KFIELD - !write(iunit,*) 'xx ', KM, KFIELD, ISL , KMLOC, JFLD, ISTAN, ISTAS, IGLS, JGL, IPROC, PAOA(JFLD,JGL,KMLOC), PSOA(JFLD,JGL,KMLOC) - !call flush(iunit) - FOUBUF_IN(ISTAN+JFLD) = PAOA(JFLD,JGL,KMLOC)+PSOA(JFLD,JGL,KMLOC) - FOUBUF_IN(ISTAS+JFLD) = PSOA(JFLD,JGL,KMLOC)-PAOA(JFLD,JGL,KMLOC) - ENDDO -! end if - ENDDO + DO JFLD=1,2*KFIELD + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL, R_NDGNH + IPROC = D_NPROCL(JGL) + ISTAN = (D_NSTAGT1B(IPROC) + D_NPNTGTB1(KMLOC,JGL))*2*KFIELD + + IGLS = R_NDGL+1-JGL + IPROCS = D_NPROCL(IGLS) + ISTAS = (D_NSTAGT1B(IPROCS) + D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD + + FOUBUF_IN(ISTAN+JFLD) = PAOA(JFLD,JGL,KMLOC)+PSOA(JFLD,JGL,KMLOC) + FOUBUF_IN(ISTAS+JFLD) = PSOA(JFLD,JGL,KMLOC)-PAOA(JFLD,JGL,KMLOC) + ENDDO + ENDDO ENDDO !$ACC END DATA From 13f4c3ab4e292e6790c1fa81214edfd505a9c020 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:58 -0700 Subject: [PATCH 053/263] First cleanup prfi1b --- src/trans/gpu/internal/prfi1b_mod.F90 | 82 +++++++++++++-------------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index fc2f491ea..f92ba2619 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -72,7 +73,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: II, INM, IR, J, JFLD, ILCM, IOFF,IFLD + INTEGER(KIND=JPIM) :: II, INM, IR, JN, JFLD, ILCM, IASM0,IFLD ! ------------------------------------------------------------------ @@ -93,20 +94,20 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !loop over wavenumber - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ILCM,IFLD,IOFF,IR,II,INM) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) DO KMLOC=1,D_NUMP - DO J=1,R_NSMAX+1 + DO JN=1,R_NSMAX+1 DO JFLD=1,KFIELDS KM = D_MYMS(KMLOC) ILCM = R_NSMAX+1-KM IFLD = KFLDPTR(JFLD) - IF (J .LE. ILCM) THEN - IOFF = D_NASM0(KM) - INM = IOFF+(ILCM-J)*2 + IF (JN .LE. ILCM) THEN + IASM0 = D_NASM0(KM) + INM = IASM0+(ILCM-JN)*2 IR = 2*(JFLD-1)+1 II = IR+1 - PIA(IR,J+2,KMLOC) = PSPEC(iFLD,INM ) - PIA(II,J+2,KMLOC) = PSPEC(iFLD,INM+1) + PIA(IR,JN+2,KMLOC) = PSPEC(iFLD,INM ) + PIA(II,JN+2,KMLOC) = PSPEC(iFLD,INM+1) END IF ENDDO ENDDO @@ -126,45 +127,40 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) ! end loop over wavenumber END DO - ELSE +ELSE - !loop over wavenumber + !loop over wavenumber - !$ACC PARALLEL LOOP !!COLLAPSE(3) PRIVATE(KM,ILCM,IOFF,INM,IR,II) - DO KMLOC=1,D_NUMP - DO J=1,R_NSMAX+1 - DO JFLD=1,KFIELDS - KM = D_MYMS(KMLOC) - ILCM = R_NSMAX+1-KM - if (J .le. ILCM) then - IOFF = D_NASM0(KM) - INM = IOFF+(ILCM-J)*2 - IR = 2*(JFLD-1)+1 - II = IR+1 - IF( INM .LT. KDIM ) THEN - PIA(IR,J+2,KMLOC) = PSPEC(JFLD,INM ) - PIA(II,J+2,KMLOC) = PSPEC(JFLD,INM+1) - ENDIF - end if - ENDDO + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ILCM,IASM0,INM,JN) DEFAULT(NONE) + DO KMLOC=1,D_NUMP + DO JFLD=1,KFIELDS + KM = D_MYMS(KMLOC) + IASM0 = D_NASM0(KM) + + !$ACC LOOP SEQ + DO JN=2,R_NSMAX+2-KM + INM = IASM0+((R_NSMAX+2-JN)-KM)*2 + IF( INM .LT. KDIM ) THEN ! TODO is this really needed, we don't have it in the reverse... + ! TODO THIS IS NOT JN+1 in the reverse code but JN + PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) + PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) + END IF ENDDO - - ! end loop over wavenumber - END DO + ENDDO + END DO - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ILCM) - DO KMLOC=1,D_NUMP - DO JFLD=1,2*KFIELDS - KM = D_MYMS(KMLOC) - ILCM = R_NSMAX+1-KM - PIA(JFLD,1,KMLOC) = 0.0_JPRB - PIA(JFLD,2,KMLOC) = 0.0_JPRB - PIA(JFLD,ILCM+3,KMLOC) = 0.0_JPRB - ENDDO - ! end loop over wavenumber - END DO - - END IF + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,JN) + DO KMLOC=1,D_NUMP + DO JFLD=1,2*KFIELDS + PIA(JFLD,1,KMLOC) = 0.0_JPRB + PIA(JFLD,2,KMLOC) = 0.0_JPRB + + KM = D_MYMS(KMLOC) + JN = R_NSMAX+3-KM + PIA(JFLD,JN+1,KMLOC) = 0.0_JPRB + ENDDO + END DO +END IF !$ACC END DATA !$ACC END DATA From 0698f15692ccb1bebe52c0fde6e1fe32dc593bde Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:58 -0700 Subject: [PATCH 054/263] Remove ZN --- src/trans/gpu/internal/vdtuv_mod.F90 | 58 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index bb74bf375..73c5e2a8c 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -13,7 +14,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT -USE TPM_DIM ,ONLY : R +USE TPM_DIM ,ONLY : R, R_NTMAX USE TPM_FIELDS ,ONLY : F USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS use tpm_gen, only: nout @@ -77,7 +78,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) INTEGER(KIND=JPIM) :: KM, kmloc INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:D%NUMP,0:R%NTMAX+2) -REAL(KIND=JPRB), INTENT(IN) :: PVOR(:,:,:),PDIV(:,:,:) +REAL(KIND=JPRB), INTENT(INOUT) :: PVOR(:,:,:),PDIV(:,:,:) REAL(KIND=JPRB), INTENT(OUT) :: PU (:,:,:),PV (:,:,:) ! LOCAL INTEGER SCALARS @@ -85,15 +86,14 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+4) REAL(KIND=JPRBT) :: ZLAPIN(-1:R%NSMAX+4) REAL(KIND=JPRBT) :: ZEPSNM(-1:R%NSMAX+4) !$ACC DATA & -!$ACC CREATE (ZEPSNM, ZN, ZLAPIN) & +!$ACC CREATE (ZEPSNM, ZLAPIN) & !$ACC COPYIN (D,D%MYMS,F,F%RLAPIN,F%RN) & !$ACC PRESENT(PEPSNM, PVOR, PDIV) & -!$ACC PRESENT(PU, PV) +!$ACC PRESENT(PU, PV, D_MYMS) ! ------------------------------------------------------------------ @@ -103,35 +103,34 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ISMAX = R%NSMAX DO KMLOC=1,D%NUMP - ZKM = D%MYMS(KMLOC) + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) !$ACC PARALLEL LOOP DEFAULT(NONE) - DO JN=ZKM-1,ISMAX+2 + DO JN=KM-1,ISMAX+2 IJ = ISMAX+3-JN - ZN(IJ) = F%RN(JN) ZLAPIN(IJ) = F%RLAPIN(JN) IF( JN >= 0 ) THEN - ZEPSNM(IJ) = PEPSNM(KMLOC,JN) + ZEPSNM(ISMAX+3-JN) = PEPSNM(KMLOC,JN) ELSE - ZEPSNM(IJ) = 0 + ZEPSNM(ISMAX+3-JN) = 0 ENDIF ENDDO - !$ACC KERNELS DEFAULT(NONE) - ZN(0) = F%RN(ISMAX+3) - !$ACC END KERNELS !* 1.1 U AND V (KM=0) . -IF(ZKM == 0) THEN +IF(KM == 0) THEN !$ACC PARALLEL LOOP DEFAULT(NONE) DO J=1,KFIELD IR = 2*J-1 - DO JI=2,ISMAX+3 + + DO JN=0,R_NTMAX+1 + JI = R_NTMAX+3-JN PU(IR,JI,KMLOC) = +& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(IR,JI+1,KMLOC)-& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(IR,JI-1,KMLOC) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(IR,JI-1,KMLOC) PV(IR,JI,KMLOC) = -& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(IR,JI+1,KMLOC)+& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(IR,JI-1,KMLOC) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(IR,JI-1,KMLOC) ENDDO ENDDO ELSE @@ -139,23 +138,24 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO J=1,KFIELD - DO JI=2,ISMAX+3-ZKM - !ZKM = D_MYMS(KMLOC) + + DO JN=KM,R_NTMAX+1 + JI = R_NTMAX+3-JN IR = 2*J-1 II = IR+1 !IF (ZKM>0 .AND. JI<=ISMAX+3-zKM) THEN PU(ir,JI,kmloc) = -ZKM*ZLAPIN(JI)*PDIV(ii,JI,kmloc)+& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ir,JI+1,kmloc)-& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ir,JI-1,kmloc) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ir,JI+1,kmloc)-& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ir,JI-1,kmloc) PU(ii,JI,kmloc) = +ZKM*ZLAPIN(JI)*PDIV(ir,JI,kmloc)+& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ii,JI+1,kmloc)-& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ii,JI-1,kmloc) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ii,JI+1,kmloc)-& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ii,JI-1,kmloc) PV(ir,JI,kmloc) = -ZKM*ZLAPIN(JI)*PVOR(ii,JI,kmloc)-& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ir,JI+1,kmloc)+& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ir,JI-1,kmloc) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ir,JI+1,kmloc)+& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ir,JI-1,kmloc) PV(ii,JI,kmloc) = +ZKM*ZLAPIN(JI)*PVOR(ir,JI,kmloc)-& - &ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ii,JI+1,kmloc)+& - &ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ii,JI-1,kmloc) + &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ii,JI+1,kmloc)+& + &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ii,JI-1,kmloc) !ENDIF ENDDO ENDDO From 5afa4aadc0f1755b63fb07a6dece7342775af17f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:58 -0700 Subject: [PATCH 055/263] Remove ZALPIN and ZEPSMN --- src/trans/gpu/internal/vdtuv_mod.F90 | 46 ++++++++++------------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 73c5e2a8c..70dfc09fe 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -86,11 +86,8 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM -REAL(KIND=JPRBT) :: ZLAPIN(-1:R%NSMAX+4) -REAL(KIND=JPRBT) :: ZEPSNM(-1:R%NSMAX+4) !$ACC DATA & -!$ACC CREATE (ZEPSNM, ZLAPIN) & !$ACC COPYIN (D,D%MYMS,F,F%RLAPIN,F%RN) & !$ACC PRESENT(PEPSNM, PVOR, PDIV) & !$ACC PRESENT(PU, PV, D_MYMS) @@ -105,16 +102,6 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) DO KMLOC=1,D%NUMP KM = D_MYMS(KMLOC) ZKM = REAL(KM,JPRBT) - !$ACC PARALLEL LOOP DEFAULT(NONE) - DO JN=KM-1,ISMAX+2 - IJ = ISMAX+3-JN - ZLAPIN(IJ) = F%RLAPIN(JN) - IF( JN >= 0 ) THEN - ZEPSNM(ISMAX+3-JN) = PEPSNM(KMLOC,JN) - ELSE - ZEPSNM(ISMAX+3-JN) = 0 - ENDIF - ENDDO !* 1.1 U AND V (KM=0) . @@ -126,11 +113,11 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) DO JN=0,R_NTMAX+1 JI = R_NTMAX+3-JN PU(IR,JI,KMLOC) = +& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(IR,JI+1,KMLOC)-& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(IR,JI-1,KMLOC) + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) PV(IR,JI,KMLOC) = -& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(IR,JI+1,KMLOC)+& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(IR,JI-1,KMLOC) + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) ENDDO ENDDO ELSE @@ -138,24 +125,23 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO J=1,KFIELD - DO JN=KM,R_NTMAX+1 JI = R_NTMAX+3-JN IR = 2*J-1 II = IR+1 !IF (ZKM>0 .AND. JI<=ISMAX+3-zKM) THEN - PU(ir,JI,kmloc) = -ZKM*ZLAPIN(JI)*PDIV(ii,JI,kmloc)+& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ir,JI+1,kmloc)-& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ir,JI-1,kmloc) - PU(ii,JI,kmloc) = +ZKM*ZLAPIN(JI)*PDIV(ir,JI,kmloc)+& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PVOR(ii,JI+1,kmloc)-& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PVOR(ii,JI-1,kmloc) - PV(ir,JI,kmloc) = -ZKM*ZLAPIN(JI)*PVOR(ii,JI,kmloc)-& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ir,JI+1,kmloc)+& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ir,JI-1,kmloc) - PV(ii,JI,kmloc) = +ZKM*ZLAPIN(JI)*PVOR(ir,JI,kmloc)-& - &(JN-1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PDIV(ii,JI+1,kmloc)+& - &(JN+2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PDIV(ii,JI-1,kmloc) + PU(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PDIV(ii,JI,kmloc)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ir,JI+1,kmloc)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ir,JI-1,kmloc) + PU(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PDIV(ir,JI,kmloc)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ii,JI+1,kmloc)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ii,JI-1,kmloc) + PV(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PVOR(ii,JI,kmloc)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ir,JI+1,kmloc)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ir,JI-1,kmloc) + PV(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PVOR(ir,JI,kmloc)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ii,JI+1,kmloc)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ii,JI-1,kmloc) !ENDIF ENDDO ENDDO From 9e2bd808eb26c5fb1f666a8e2da79760e43cceb8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:58 -0700 Subject: [PATCH 056/263] Restructure loop --- src/trans/gpu/internal/vdtuv_mod.F90 | 73 ++++++++++++---------------- 1 file changed, 32 insertions(+), 41 deletions(-) diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 70dfc09fe..12d8073bb 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -82,7 +82,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) REAL(KIND=JPRB), INTENT(OUT) :: PU (:,:,:),PV (:,:,:) ! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: II, IJ, IR, J, JN, ISMAX,JI +INTEGER(KIND=JPIM) :: II, IJ, IR, J, JN, JI ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM @@ -98,54 +98,45 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. ! ------------------------------------------ -ISMAX = R%NSMAX +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO KMLOC=1,D%NUMP - KM = D_MYMS(KMLOC) - ZKM = REAL(KM,JPRBT) - -!* 1.1 U AND V (KM=0) . - -IF(KM == 0) THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) DO J=1,KFIELD IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) - DO JN=0,R_NTMAX+1 - JI = R_NTMAX+3-JN - PU(IR,JI,KMLOC) = +& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) - PV(IR,JI,KMLOC) = -& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) - ENDDO - ENDDO -ELSE -!* 1.2 U AND V (KM!=0) . + IF(KM == 0) THEN + !$ACC LOOP SEQ + DO JN=0,R_NTMAX+1 + JI = R_NTMAX+3-JN + PU(IR,JI,KMLOC) = +& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) + PV(IR,JI,KMLOC) = -& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) + ENDDO - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO J=1,KFIELD + ELSE + !$ACC LOOP SEQ DO JN=KM,R_NTMAX+1 JI = R_NTMAX+3-JN - IR = 2*J-1 - II = IR+1 - !IF (ZKM>0 .AND. JI<=ISMAX+3-zKM) THEN - PU(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PDIV(ii,JI,kmloc)+& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ir,JI+1,kmloc)-& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ir,JI-1,kmloc) - PU(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PDIV(ir,JI,kmloc)+& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ii,JI+1,kmloc)-& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ii,JI-1,kmloc) - PV(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PVOR(ii,JI,kmloc)-& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ir,JI+1,kmloc)+& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ir,JI-1,kmloc) - PV(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PVOR(ir,JI,kmloc)-& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ii,JI+1,kmloc)+& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ii,JI-1,kmloc) - !ENDIF + PU(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PDIV(ii,JI,kmloc)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ir,JI+1,kmloc)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ir,JI-1,kmloc) + PU(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PDIV(ir,JI,kmloc)+& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ii,JI+1,kmloc)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ii,JI-1,kmloc) + PV(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PVOR(ii,JI,kmloc)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ir,JI+1,kmloc)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ir,JI-1,kmloc) + PV(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PVOR(ir,JI,kmloc)-& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ii,JI+1,kmloc)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ii,JI-1,kmloc) ENDDO - ENDDO - ENDIF + ENDIF + ENDDO ENDDO !$ACC END DATA From b1ca5b98d7e12b2a43bdfee465beb5a6c4db1e47 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:58 -0700 Subject: [PATCH 057/263] Usual restructuring for SPNSDE --- src/trans/gpu/internal/spnsde_mod.F90 | 81 +++++++++------------------ 1 file changed, 28 insertions(+), 53 deletions(-) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index 6d9a649c1..b8e0d02d0 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -14,8 +15,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE TPM_GEN, only: nout -USE TPM_DIM ,ONLY : R -USE TPM_FIELDS ,ONLY : F +USE TPM_DIM ,ONLY : R, R_NTMAX USE TPM_DISTR ,ONLY : D !USE TPM_TRANS @@ -82,14 +82,11 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) REAL(KIND=JPRB), INTENT(OUT) :: PNSD(:,:,:) ! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN,JI,ISMAX, IR, II -REAL(KIND=JPRBT) :: ZZEPSNM(-1:R%NSMAX+4) -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+4) +INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN,JI, IR, II !$ACC DATA & -!$ACC CREATE (ZN,ZZEPSNM) & -!$ACC PRESENT (F,F%RN) & -!$ACC PRESENT (PEPSNM, PF, PNSD) +!$ACC PRESENT (D) & +!$ACC PRESENT (PEPSNM,PF,PNSD) ! ------------------------------------------------------------------ @@ -99,54 +96,32 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) !* 1.1 COMPUTE -ISMAX = R%NSMAX -!loop over wavenumber +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(IR) DO KMLOC=1,D%NUMP - KM = D%MYMS(KMLOC) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IJ) - DO JN=KM-1,ISMAX+2 - IJ = ISMAX+3-JN - ZN(IJ) = F%RN(JN) - IF( JN >= 0 ) THEN - ZZEPSNM(IJ) = PEPSNM(KMLOC,JN) - ELSE - ZZEPSNM(IJ) = 0 - ENDIF - !write(nout,*) 'deriv dy debug in ; ',JN, IJ, ZN(IJ),ZZEPSNM(IJ),PEPSNM(KMLOC,JN) - ENDDO - !$ACC KERNELS DEFAULT(NONE) - ZN(0) = F%RN(ISMAX+3) - !$ACC END KERNELS - - IF(KM == 0) THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IR) - DO J=1,KF_SCALARS - IR = 2*J-1 - DO JI=2,ISMAX+3 - PNSD(IR,JI,KMLOC) = -ZN(JI+1)*ZZEPSNM(JI)*PF(IR,JI+1,KMLOC)+& - &ZN(JI-2)*ZZEPSNM(JI-1)*PF(IR,JI-1,KMLOC) - ENDDO - ENDDO - ELSE - - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(IR,II) - DO J=1,KF_SCALARS - DO JI=2,ISMAX+3-KM - IR = 2*J-1 - II = IR+1 - PNSD(IR,JI,KMLOC) = -ZN(JI+1)*ZZEPSNM(JI)*PF(IR,JI+1,KMLOC)+& - &ZN(JI-2)*ZZEPSNM(JI-1)*PF(IR,JI-1,KMLOC) - PNSD(II,JI,KMLOC) = -ZN(JI+1)*ZZEPSNM(JI)*PF(II,JI+1,KMLOC)+& - &ZN(JI-2)*ZZEPSNM(JI-1)*PF(II,JI-1,KMLOC) - !write(301,*) 'deriv dy debug 2nd; ',KMLOC,IR,II,JI,J,PNSD(IR,JI,KMLOC),PNSD(II,JI,KMLOC) - !call flush(301) + DO J=1,KF_SCALARS + KM = D%MYMS(KMLOC) + IR = 2*J-1 + II = IR+1 + + IF(KM == 0) THEN + !$ACC LOOP SEQ + DO JN=0,R_NTMAX+1 + JI = R_NTMAX+3-JN + PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) ENDDO - ENDDO - !write(301,*) 'deriv dy debug 2nd; ',KMLOC,maxval(PNSD(1,:,KMLOC)),maxval(PNSD(2,:,KMLOC)) - !call flush(301) - ENDIF -!end loop over wavenumber + ELSE + !$ACC LOOP SEQ + DO JN=KM,R_NTMAX+1 + JI = R_NTMAX+3-JN + PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) + PNSD(II,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(II,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(II,JI-1,KMLOC) + ENDDO + ENDIF + ENDDO END DO !$ACC END DATA From cdeae31a08c5dc0d919e10748a0b230d893e3720 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:59 -0700 Subject: [PATCH 058/263] Simplify GEMMS --- src/trans/gpu/internal/leinv_mod.F90 | 36 +++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 2c5ca7dd2..c12718c03 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -81,7 +81,6 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM), INTENT(IN) :: KFC INTEGER(KIND=JPIM) :: KIFC -INTEGER(KIND=JPIM) :: KDGLU INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) REAL(KIND=JPRBT), INTENT(OUT) :: PSOA1(:,:,:) @@ -110,10 +109,10 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ENDIF CALL GSTATS(453,0) -ALLOCATE(ZZBA(ITDZBA,ILDZBA,D_NUMP)) -ALLOCATE(ZZCSTA(ITDZCA,ILDZCA,D_NUMP)) -ALLOCATE(ZZBS(ITDZBS,ILDZBS,D_NUMP)) -ALLOCATE(ZZCSTS(ITDZCS,ILDZCS,D_NUMP)) +ALLOCATE(ZZBA(KFC,TDZAA,D_NUMP)) +ALLOCATE(ZZCSTA(KFC,R_NDGNH,D_NUMP)) +ALLOCATE(ZZBS(KFC,TDZAS,D_NUMP)) +ALLOCATE(ZZCSTS(KFC,R_NDGNH,D_NUMP)) !$ACC DATA CREATE(ZZBA,ZZCSTA,ZZBS,ZZCSTS) !$ACC DATA COPYIN (D,D_MYMS,R,G,G_NDGLU,D_NUMP,R_NDGNH,R_NSMAX) & @@ -126,7 +125,6 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH DO J1=2,KFC,2 - KM = D_MYMS(KMLOC) IF(KM == 0)THEN PSOA1(J1,JGL,KMLOC) = 0.0_JPRBT @@ -145,7 +143,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 IA = 1+MOD(R_NSMAX-KM+2,2) - ZZBA((JK-1)+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZZBA(JK,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) .EQ. 0) THEN !$ACC LOOP SEQ @@ -168,21 +166,20 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC HOST_DATA USE_DEVICE(ZAA,ZZBA,ZZCSTA) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & ITDZCA, ILDZCA, ILDZBA, & + & KFC, R_NDGNH, TDZAA, & & 1.0_JPRBT, & - & ZZBA, ITDZBA, ILDZBA,& - & ZAA, LDZAA, TDZAA, & + & ZZBA, KFC, TDZAA,& + & ZAA, R_NDGNH, TDZAA, & & 0._JPRBT, & - & ZZCSTA, ITDZCA, ILDZCA, & + & ZZCSTA, KFC, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,KDGLU) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN @@ -204,7 +201,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 IS = 1+MOD(R_NSMAX-KM+1,2) - ZZBS((JK-1)+1,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZZBS(JK,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) .EQ. 0) THEN !$ACC LOOP SEQ @@ -222,21 +219,20 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC HOST_DATA USE_DEVICE(ZAS,ZZBS,ZZCSTS) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & ITDZCS, ILDZCS, ILDZBS, & + & KFC, R_NDGNH, TDZAS, & & 1.0_JPRBT, & - & ZZBS, ITDZBS, ILDZBS, & - & ZAS, LDZAS, TDZAS, & + & ZZBS, KFC, TDZAS, & + & ZAS, R_NDGNH, TDZAS, & & 0._JPRBT, & - & ZZCSTS, ITDZCS, ILDZCS, & + & ZZCSTS, KFC, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,KDGLU) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 - KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN From 5d3c0d85a180d08c568083894d1d0e00a7a51730 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:59 -0700 Subject: [PATCH 059/263] LEINV/LEDIR are more similar now and reallocate data --- src/trans/gpu/internal/ledir_mod.F90 | 78 ++++++++++++++++------------ src/trans/gpu/internal/leinv_mod.F90 | 78 +++++++++++----------------- 2 files changed, 76 insertions(+), 80 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 524b74874..6ae86db8f 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -58,10 +58,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : F, & - & ZAA,DZBST,DZCAT,ZAS,DZCST,& - & ZAA0,DZBST0,DZCAT0,ZAS0,DZCST0,& - & TDZAA,TDZAS,KMLOC0 +USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR USE TPM_GEN, ONLY: NOUT USE TPM_FLT @@ -90,23 +87,34 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 +REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) +REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) +IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') +ENDIF +CALL GSTATS(452,0) + +ALLOCATE(ZINP(2*KF_FS*R_NDGNH*D_NUMP)) +ALLOCATE(ZOUT(2*KF_FS*TDZAS*D_NUMP)) +ALLOCATE(ZINP0(2*KF_FS*R_NDGNH)) +ALLOCATE(ZOUT0(2*KF_FS*TDZAS)) + !$ACC DATA & +!$ACC& CREATE(ZINP,ZOUT,ZINP0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & -!$ACC& PRESENT(ZAA,ZAS,DZBST,DZCST,DZCAT) & -!$ACC& PRESENT(POA1,DZBST0,DZCAT0,DZBST0,DZCST0) & +!$ACC& PRESENT(ZAA,ZAS,POA1) & !$ACC& PRESENT(FOUBUF,D_NPNTGTB1,D_NSTAGT1B,D_NPROCL) +! TODO this doesn't make sense that we need it (???) +!$ACC KERNELS +ZINP(:) = 0 +!$ACC END KERNELS ! anti-symmetric -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') -ENDIF -CALL GSTATS(452,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) DO KMLOC=1,D_NUMP DO JF=1,KF_FS*2 @@ -121,7 +129,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IF (JF .LE. 4*KF_UV) THEN PAIA = PAIA*F%RACTHE(JGL) ENDIF - DZBST((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ZINP(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) ENDDO ENDDO END DO @@ -130,15 +138,15 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAA,DZBST,DZCAT) +!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRBT, & - & DZBST, 2*KF_FS, R_NDGNH, & + & ZINP, 2*KF_FS, R_NDGNH, & & ZAA, R_NDGNH, TDZAA, & & 0.0_JPRBT, & - & DZCAT, 2*KF_FS, TDZAA, & + & ZOUT, 2*KF_FS, TDZAA, & & D_NUMP) !$ACC END HOST_DATA @@ -150,7 +158,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IA = 1+MOD(R_NTMAX-KM+2,2) !$ACC LOOP SEQ DO J=1,(R%NSMAX-KM+2)/2 - POA1(JK,IA+(J-1)*2,KMLOC) = DZCAT((JK-1)+1+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) + POA1(JK,IA+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) ENDDO ENDIF ENDDO @@ -163,8 +171,8 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - DZBST0((JF-1)/2+1+(JGL-1)*2*KF_FS) & - & = DZBST((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + & = ZINP((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -173,15 +181,15 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !C=A*B => ! C^T=B^T*A^T - !$ACC HOST_DATA USE_DEVICE(ZAA0,DZBST0,DZCAT0) + !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUT0) CALL CUDA_DGEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRD, & - & DZBST0, 2*KF_FS, R_NDGNH, & + & ZINP0, 2*KF_FS, R_NDGNH, & & ZAA0, R_NDGNH, TDZAA, & & 0.0_JPRD, & - & DZCAT0, 2*KF_FS, TDZAA, & + & ZOUT0, 2*KF_FS, TDZAA, & & 1) !$ACC END HOST_DATA @@ -189,7 +197,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO J=1,(R_NSMAX+2)/2 DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+(J-1)*2,KMLOC0) = DZCAT0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IA+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO ENDIF @@ -210,7 +218,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IF (JF .LE. 4*KF_UV) THEN PAIA = PAIA*F%RACTHE(JGL) ENDIF - DZBST((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ZINP((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) ENDDO ENDDO END DO @@ -218,15 +226,15 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAS,DZBST,DZCST) +!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRBT, & - & DZBST, 2*KF_FS, R_NDGNH, & + & ZINP, 2*KF_FS, R_NDGNH, & & ZAS, R_NDGNH, TDZAS, & & 0.0_JPRBT, & - & DZCST, 2*KF_FS, TDZAS, & + & ZOUT, 2*KF_FS, TDZAS, & & D_NUMP) !$ACC END HOST_DATA @@ -238,7 +246,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IS = 1+MOD(R_NTMAX-KM+1,2) !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) + POA1(JK,IS+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) ENDDO ENDIF ENDDO @@ -248,8 +256,8 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - DZBST0((JF-1)/2+1+(JGL-1)*2*KF_FS) & - & = DZBST((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + & = ZINP((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -257,15 +265,15 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !C=A*B => ! C^T=B^T*A^T - !$ACC host_data use_device(ZAS0,DZBST0,DZCST0) + !$ACC host_data use_device(ZAS0,ZINP0,ZOUT0) call CUDA_DGEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRD, & - & DZBST0, 2*KF_FS, R_NDGNH, & + & ZINP0, 2*KF_FS, R_NDGNH, & & ZAS0, R_NDGNH, TDZAS, & & 0.0_JPRD, & - & DZCST0, 2*KF_FS, TDZAS, & + & ZOUT0, 2*KF_FS, TDZAS, & & 1) !$ACC end host_data @@ -273,13 +281,17 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO J=1,(R_NSMAX+3)/2 DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+(J-1)*2,KMLOC0) = DZCST0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IS+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO ENDIF !$ACC END DATA +DEALLOCATE(ZINP) +DEALLOCATE(ZOUT) +DEALLOCATE(ZINP0) +DEALLOCATE(ZOUT0) IF (LSYNC_TRANS) THEN diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index c12718c03..da40c9f6b 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -59,11 +59,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : F, ZIA, & - & ZAA,ZAS,LDZAA,LDZAS,TDZAA,TDZAS,& - & IZBS,ILDZBA,ILDZBS,ITDZBA,ITDZBS,& - & IZCS,IZCST,ILDZCA,ILDZCS,ITDZCA,ITDZCS,& - & TDZAS, IF_FS_INV, ZAMAX, ZSMAX +USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS USE TPM_GEN, ONLY: NOUT USE TPM_FLT @@ -88,8 +84,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! LOCAL INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J1, IF, JGL,JK, J,JI, IRET -REAL(KIND=JPRBT), ALLOCATABLE :: ZZBS(:,:,:), ZZCSTS(:,:,:) -REAL(KIND=JPRBT), ALLOCATABLE :: ZZBA(:,:,:), ZZCSTA(:,:,:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) INTEGER(KIND=JPIM) :: ISTAT @@ -109,17 +104,12 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ENDIF CALL GSTATS(453,0) -ALLOCATE(ZZBA(KFC,TDZAA,D_NUMP)) -ALLOCATE(ZZCSTA(KFC,R_NDGNH,D_NUMP)) -ALLOCATE(ZZBS(KFC,TDZAS,D_NUMP)) -ALLOCATE(ZZCSTS(KFC,R_NDGNH,D_NUMP)) -!$ACC DATA CREATE(ZZBA,ZZCSTA,ZZBS,ZZCSTS) +ALLOCATE(ZINP(KFC*TDZAS*D_NUMP)) +ALLOCATE(ZOUT(KFC*R_NDGNH*D_NUMP)) -!$ACC DATA COPYIN (D,D_MYMS,R,G,G_NDGLU,D_NUMP,R_NDGNH,R_NSMAX) & -!$ACC& PRESENT (ZAA,ZAS) & -!$ACC& PRESENT (ZZBA,ZZBS,ZZCSTA,ZZCSTS) & -!$ACC& PRESENT (PIA) & -!$ACC& PRESENT (PSOA1,PAOA1,IZBS) +!$ACC DATA COPYIN (D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & +!$ACC& CREATE (ZINP,ZOUT) & +!$ACC& PRESENT (ZAA,ZAS,PIA,PSOA1,PAOA1) !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -135,47 +125,45 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !end loop over wavenumber END DO -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NSMAX-KM+2,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - IA = 1+MOD(R_NSMAX-KM+2,2) - ZZBA(JK,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*KFC)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - IA = 1+MOD(R_NSMAX+2,2) - ZZBA((JK-1)/2+1,J,KMLOC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*KFC)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO ENDDO ! operate on full arrays, where non-relavent entries have been set to zero -! call CUDA_DGEMM_BATCHED('N','N',LDZAA,TDZBA,TDZAA,1.0_JPRB,ZAA,LDZAA,TDZAA,ZBA,LDZBA,TDZBA,0._JPRB,ZCA,LDZCA,TDZCA,D_NUMP) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION -!$ACC HOST_DATA USE_DEVICE(ZAA,ZZBA,ZZCSTA) +!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFC, R_NDGNH, TDZAA, & & 1.0_JPRBT, & - & ZZBA, KFC, TDZAA,& + & ZINP, KFC, TDZAA,& & ZAA, R_NDGNH, TDZAA, & & 0._JPRBT, & - & ZZCSTA, KFC, R_NDGNH, & + & ZOUT, KFC, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) @@ -183,9 +171,9 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN - PAOA1(JK,JGL,KMLOC) = ZZCSTA((JK-1)+1,JGL-ISL+1,KMLOC) + PAOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - PAOA1(JK,JGL,KMLOC) = ZZCSTA((JK-1)/2+1,JGL-ISL+1,KMLOC) + PAOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) ENDIF ENDDO ENDDO @@ -193,21 +181,20 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! 2. +++++++++++++ symmetric -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NSMAX-KM+1,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - IS = 1+MOD(R_NSMAX-KM+1,2) - ZZBS(JK,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*KFC)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - IS = 1+MOD(R_NSMAX+1,2) - ZZBS((JK-1)/2+1,J,KMLOC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*KFC)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -216,19 +203,19 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAS,ZZBS,ZZCSTS) +!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFC, R_NDGNH, TDZAS, & & 1.0_JPRBT, & - & ZZBS, KFC, TDZAS, & + & ZINP, KFC, TDZAS, & & ZAS, R_NDGNH, TDZAS, & & 0._JPRBT, & - & ZZCSTS, KFC, R_NDGNH, & + & ZOUT, KFC, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,KFC KM = D_MYMS(KMLOC) @@ -236,9 +223,9 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN - PSOA1(JK,JGL,KMLOC) = ZZCSTS((JK-1)+1,JGL-ISL+1,KMLOC) + PSOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - PSOA1(JK,JGL,KMLOC) = ZZCSTS((JK-1)/2+1,JGL-ISL+1,KMLOC) + PSOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) ENDIF ENDDO ENDDO @@ -246,11 +233,8 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC END DATA -!$ACC END DATA -DEALLOCATE(ZZBS) -DEALLOCATE(ZZBA) -DEALLOCATE(ZZCSTS) -DEALLOCATE(ZZCSTA) +DEALLOCATE(ZINP) +DEALLOCATE(ZOUT) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') From 365eb7471a080961280947d36f99f65e508217d0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:59 -0700 Subject: [PATCH 060/263] Use same indexing and PIA/POA size. TODO: Ideally we will move indexing to negative or use a good name for the halo size (which is 3 to the left right now) --- src/trans/gpu/internal/ledir_mod.F90 | 8 ++++---- src/trans/gpu/internal/prfi1b_mod.F90 | 2 +- src/trans/gpu/internal/updspb_mod.F90 | 8 ++++---- src/trans/gpu/internal/uvtvd_mod.F90 | 10 +++++----- src/trans/gpu/internal/vdtuv_mod.F90 | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 6ae86db8f..88acfc7d3 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -158,7 +158,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IA = 1+MOD(R_NTMAX-KM+2,2) !$ACC LOOP SEQ DO J=1,(R%NSMAX-KM+2)/2 - POA1(JK,IA+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) + POA1(JK,IA+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) ENDDO ENDIF ENDDO @@ -197,7 +197,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO J=1,(R_NSMAX+2)/2 DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO ENDIF @@ -246,7 +246,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) IS = 1+MOD(R_NTMAX-KM+1,2) !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - POA1(JK,IS+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) + POA1(JK,IS+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) ENDDO ENDIF ENDDO @@ -281,7 +281,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO J=1,(R_NSMAX+3)/2 DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) ENDDO ENDDO diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index f92ba2619..a2b410b71 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -131,7 +131,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !loop over wavenumber - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ILCM,IASM0,INM,JN) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IASM0,INM,JN) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JFLD=1,KFIELDS KM = D_MYMS(KMLOC) diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 9d319e643..25dc71fbf 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -104,15 +104,15 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) IF(KM == 0) THEN !$ACC LOOP SEQ - DO JN=R%NTMAX+2-R%NSMAX,R%NTMAX+2 - INM = IASM0+(R%NTMAX+2-JN)*2 + DO JN=3,R%NTMAX+3 + INM = IASM0+(R%NTMAX+3-JN)*2 PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) PSPEC(JFLD,INM+1) = 0.0_JPRBT ENDDO ELSE !$ACC LOOP SEQ - DO JN=R%NTMAX+2-R%NSMAX,R%NTMAX+2-KM - INM = IASM0+((R%NTMAX+2-JN)-KM)*2 + DO JN=3,R%NTMAX+3-KM + INM = IASM0+((R%NTMAX+3-JN)-KM)*2 PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) PSPEC(JFLD,INM+1) = POA(2*JFLD ,JN,KMLOC) ENDDO diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 3fe9a9778..f5fa2021c 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -110,12 +110,12 @@ SUBROUTINE UVTVD(KF_UV) !* 1.1 SET N=KM-1 COMPONENT TO 0 FOR U AND V -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IN) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO J=1,2*KF_UV KM = D_MYMS(KMLOC) - PU(J,R_NTMAX+3-KM,KMLOC) = 0.0_JPRBT - PV(J,R_NTMAX+3-KM,KMLOC) = 0.0_JPRBT + PU(J,R_NTMAX+4-KM,KMLOC) = 0.0_JPRBT + PV(J,R_NTMAX+4-KM,KMLOC) = 0.0_JPRBT ENDDO ENDDO @@ -132,7 +132,7 @@ SUBROUTINE UVTVD(KF_UV) IF(KM /= 0) THEN !$ACC LOOP SEQ DO JN=KM,R_NTMAX - IN = R_NTMAX+2-JN + IN = R_NTMAX+3-JN ZJN = JN PVOR(IR,IN,kmloc) = -ZKM*PV(II,IN,kmloc)-& @@ -151,7 +151,7 @@ SUBROUTINE UVTVD(KF_UV) ELSE !$ACC LOOP SEQ DO JN=0,R_NTMAX - IN = R_NTMAX+2-JN + IN = R_NTMAX+3-JN ZJN = JN PVOR(IR,IN,kmloc) = -& diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 12d8073bb..7e5b68b47 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -88,7 +88,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) REAL(KIND=JPRBT) :: ZKM !$ACC DATA & -!$ACC COPYIN (D,D%MYMS,F,F%RLAPIN,F%RN) & +!$ACC PRESENT(D,D%MYMS,F,F%RLAPIN,F%RN) & !$ACC PRESENT(PEPSNM, PVOR, PDIV) & !$ACC PRESENT(PU, PV, D_MYMS) From afc73802a3ca043be6d5b8719a25d8d0e7d3df25 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:48:59 -0700 Subject: [PATCH 061/263] Inline asre1b into LEDIR --- src/trans/gpu/internal/asre1_mod.F90 | 97 ----------------------- src/trans/gpu/internal/asre1b_mod.F90 | 110 -------------------------- src/trans/gpu/internal/ledir_mod.F90 | 7 +- src/trans/gpu/internal/leinv_mod.F90 | 29 +++++-- src/trans/gpu/internal/ltinv_mod.F90 | 28 +------ 5 files changed, 29 insertions(+), 242 deletions(-) delete mode 100755 src/trans/gpu/internal/asre1_mod.F90 delete mode 100755 src/trans/gpu/internal/asre1b_mod.F90 diff --git a/src/trans/gpu/internal/asre1_mod.F90 b/src/trans/gpu/internal/asre1_mod.F90 deleted file mode 100755 index 025502bd9..000000000 --- a/src/trans/gpu/internal/asre1_mod.F90 +++ /dev/null @@ -1,97 +0,0 @@ -! (C) Copyright 2001- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE ASRE1_MOD -CONTAINS -SUBROUTINE ASRE1(KM,KMLOC,KF_OUT_LT,PAOA1,PSOA1) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_DIM ,ONLY : R - -!USE TPM_TRANS - -USE ASRE1B_MOD ,ONLY : ASRE1B - - -!**** *ASRE1* - Recombine antisymmetric and symmetric parts - -! Purpose. -! -------- -! To recombine the antisymmetric and symmetric parts of the -! Fourier arrays and update the correct parts of the state -! variables. - -!** Interface. -! ---------- -! *CALL* *ASRE1(...) - -! Explicit arguments : -! -------------------- -! KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PAOA1 - antisymmetric part of Fourier -! fields for zonal wavenumber KM (basic -! variables and N-S derivatives) -! PSOA1 - symmetric part of Fourier -! fields for zonal wavenumber KM (basic -! variables and N-S derivatives) - -! Implicit arguments : None -! -------------------- - -! Method. -! ------- - -! Externals. ASRE1B - basic recombination routine -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From ASRE1 in IFS CY22R1 - -! ------------------------------------------------------------------ - - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM) , INTENT(IN) :: KM -INTEGER(KIND=JPIM) , INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM) , INTENT(IN) :: KF_OUT_LT - -REAL(KIND=JPRBT) , INTENT(IN) :: PSOA1(:,:), PAOA1(:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IFLDS - -! WORK ARRAYS FOR ASREL1B -INTEGER(KIND=JPIM) :: ISTAN(R%NDGNH),ISTAS(R%NDGNH) - -stop 'Error: this code path is not (yet) supported in GPU version' - - -! ------------------------------------------------------------------ - -IFLDS = KF_OUT_LT - -!CALL ASRE1B(IFLDS,KM,KMLOC,PAOA1,PSOA1) - -! ------------------------------------------------------------------ - -END SUBROUTINE ASRE1 -END MODULE ASRE1_MOD diff --git a/src/trans/gpu/internal/asre1b_mod.F90 b/src/trans/gpu/internal/asre1b_mod.F90 deleted file mode 100755 index 9ef494c95..000000000 --- a/src/trans/gpu/internal/asre1b_mod.F90 +++ /dev/null @@ -1,110 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE ASRE1B_MOD -CONTAINS -SUBROUTINE ASRE1B(KFIELD,PAOA,PSOA) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL -USE TPM_TRANS ,ONLY : FOUBUF_IN -USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1 -use tpm_gen, only: nout - -!**** *ASRE1B* - Recombine antisymmetric and symmetric parts - -! Purpose. -! -------- -! To recombine the antisymmetric and symmetric parts of the -! Fourier arrays and update the correct parts of the state -! variables. - -!** Interface. -! ---------- -! *CALL* *ASRE1B(..) - -! Explicit arguments : -! ------------------- KFIELD - number of fields (input-c) -! KM - zonal wavenumber(input-c) -! KMLOC - local version of KM (input-c) -! PAOA - antisymmetric part of Fourier -! fields for zonal wavenumber KM (input) -! PSOA - symmetric part of Fourier -! fields for zonal wavenumber KM (input) - -! Implicit arguments : FOUBUF_IN - output buffer (output) -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From ASRE1B in IFS CY22R1 - -! ------------------------------------------------------------------ - - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD -INTEGER(KIND=JPIM) :: KM,KMLOC -REAL(KIND=JPRBT), INTENT(IN) :: PSOA(:,:,:) -REAL(KIND=JPRBT), INTENT(IN) :: PAOA(:,:,:) -!INTEGER(KIND=JPIM), INTENT(OUT) :: ISTAN(:,:) -!INTEGER(KIND=JPIM), INTENT(OUT) :: ISTAS(:,:) - -! LOCAL INTEGERS -INTEGER(KIND=JPIM) :: ISL, IGLS, JFLD, JGL ,IPROC, IPROCS, IDGNH, ISTAN, ISTAS - -! ------------------------------------------------------------------ - -!* 1. RECOMBINATION OF SYMMETRIC AND ANTSYMMETRIC PARTS. -! --------------------------------------------------- - -!$ACC DATA PRESENT(PAOA,PSOA,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IPROC,ISTAN,IGLS,IPROCS,ISTAS) -DO KMLOC=1,D_NUMP - DO JFLD=1,2*KFIELD - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL, R_NDGNH - IPROC = D_NPROCL(JGL) - ISTAN = (D_NSTAGT1B(IPROC) + D_NPNTGTB1(KMLOC,JGL))*2*KFIELD - - IGLS = R_NDGL+1-JGL - IPROCS = D_NPROCL(IGLS) - ISTAS = (D_NSTAGT1B(IPROCS) + D_NPNTGTB1(KMLOC,IGLS))*2*KFIELD - - FOUBUF_IN(ISTAN+JFLD) = PAOA(JFLD,JGL,KMLOC)+PSOA(JFLD,JGL,KMLOC) - FOUBUF_IN(ISTAS+JFLD) = PSOA(JFLD,JGL,KMLOC)-PAOA(JFLD,JGL,KMLOC) - ENDDO - ENDDO -ENDDO -!$ACC END DATA - -! ------------------------------------------------------------------ - -END SUBROUTINE ASRE1B -END MODULE ASRE1B_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 88acfc7d3..514b86cdf 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -54,16 +54,13 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! ------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPIB ,JPRB, JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 -USE TPM_DISTR -USE TPM_GEN, ONLY: NOUT -USE TPM_FLT +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_TRANS ,ONLY : FOUBUF -USE BUTTERFLY_ALG_MOD USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE MPL_MODULE ,ONLY : MPL_BARRIER diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index da40c9f6b..6183f9f84 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -57,13 +57,12 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) USE TPM_GEN ,ONLY : LSYNC_TRANS USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX +USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_GEN, ONLY: NOUT -USE TPM_FLT -USE BUTTERFLY_ALG_MOD +USE TPM_TRANS ,ONLY : FOUBUF_IN USE CUDA_GEMM_BATCHED_MOD USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -83,10 +82,9 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) REAL(KIND=JPRBT), INTENT(OUT) :: PAOA1(:,:,:) ! LOCAL -INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J1, IF, JGL,JK, J,JI, IRET REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) -INTEGER(KIND=JPIM) :: ISTAT +INTEGER(KIND=JPIM) :: JF, IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -233,6 +231,25 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC END DATA +!$ACC DATA PRESENT(PAOA1,PSOA1,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS) +DO KMLOC=1,D_NUMP + DO JF=1,2*KF_OUT_LT + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + !$ACC LOOP SEQ + DO JGL=ISL, R_NDGNH + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_OUT_LT + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_OUT_LT + + FOUBUF_IN(OFFSET1+JF) = PAOA1(JF,JGL,KMLOC)+PSOA1(JF,JGL,KMLOC) + FOUBUF_IN(OFFSET2+JF) = PSOA1(JF,JGL,KMLOC)-PAOA1(JF,JGL,KMLOC) + ENDDO + ENDDO +ENDDO +!$ACC END DATA + DEALLOCATE(ZINP) DEALLOCATE(ZOUT) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 1edbe7993..96dabe7f1 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -30,7 +30,6 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& USE VDTUV_MOD ,ONLY : VDTUV USE SPNSDE_MOD ,ONLY : SPNSDE USE LEINV_MOD ,ONLY : LEINV - USE ASRE1B_MOD ,ONLY : ASRE1B USE FSPGL_INT_MOD ,ONLY : FSPGL_INT USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS use ieee_arithmetic @@ -73,7 +72,6 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& ! VDTUV - compute u and v from vorticity and divergence ! SPNSDE - compute north-south derivatives ! LEINV - Inverse Legendre transform - ! ASRE1 - recombination of symmetric/antisymmetric part ! Reference. ! ---------- @@ -280,29 +278,11 @@ END SUBROUTINE cudaProfilerStop ENDIF IF( KF_OUT_LT > 0 ) THEN - !call cudaProfilerStart - CALL LEINV(IFC,KF_OUT_LT,ZIA(ISTA:ISTA+IFC-1,:,:),ZAOA1,ZSOA1) - !call cudaProfilerStop - - ! ------------------------------------------------------------------ - - !* 5. RECOMBINATION SYMMETRIC/ANTISYMMETRIC PART. - ! -------------------------------------------- + CALL LEINV(IFC,KF_OUT_LT,ZIA(ISTA:ISTA+IFC-1,:,:),ZAOA1,ZSOA1) - !FROM ZAOA1/ZSOA to FOUBUF_IN - - !CALL ASRE1B(KF_OUT_LT,ZAOA1,ZSOA1,ISTAN,ISTAS) - CALL ASRE1B(KF_OUT_LT,ZAOA1,ZSOA1) - ! ------------------------------------------------------------------ - - ! 6. OPTIONAL COMPUTATIONS IN FOURIER SPACE - - - IF(PRESENT(FSPGL_PROC)) THEN - stop 'Error: SPGL_PROC is not (yet) optimized in GPU version' - CALL FSPGL_INT(KF_UV,KF_SCALARS,KF_SCDERS,KF_OUT_LT,FSPGL_PROC,& - & KFLDPTRUV,KFLDPTRSC) - ENDIF + IF(PRESENT(FSPGL_PROC)) THEN + stop 'Error: SPGL_PROC is not (yet) optimized in GPU version. Need to figure out how to implement' + ENDIF ENDIF IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) From 11497e621cb6f4da05d55d64fa686704b9a0abe5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:00 -0700 Subject: [PATCH 062/263] Zero risk cleanup for ltinv and leinv --- src/trans/gpu/internal/leinv_mod.F90 | 73 +++++++++++++--------------- src/trans/gpu/internal/ltinv_mod.F90 | 18 ++----- 2 files changed, 38 insertions(+), 53 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 6183f9f84..1b2aa81e7 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -10,7 +10,7 @@ MODULE LEINV_MOD CONTAINS -SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) +SUBROUTINE LEINV(KF_LT,PIA) !**** *LEINV* - Inverse Legendre transform. @@ -26,9 +26,9 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! -------------------- KFC - number of fields to tranform (input-c) ! PIA - spectral fields ! for zonal wavenumber KM (input) -! PAOA1 - antisymmetric part of Fourier +! ZAOA1 - antisymmetric part of Fourier ! fields for zonal wavenumber KM (output) -! PSOA1 - symmetric part of Fourier +! ZSOA1 - symmetric part of Fourier ! fields for zonal wavenumber KM (output) ! Implicit arguments : None. @@ -59,7 +59,7 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS +USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS,ZAOA1,ZSOA1 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_GEN, ONLY: NOUT USE TPM_TRANS ,ONLY : FOUBUF_IN @@ -74,12 +74,9 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ! DUMMY ARGUMENTS INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KFC INTEGER(KIND=JPIM) :: KIFC -INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT +INTEGER(KIND=JPIM), INTENT(IN) :: KF_LT REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: PSOA1(:,:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: PAOA1(:,:,:) ! LOCAL REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) @@ -102,21 +99,21 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) ENDIF CALL GSTATS(453,0) -ALLOCATE(ZINP(KFC*TDZAS*D_NUMP)) -ALLOCATE(ZOUT(KFC*R_NDGNH*D_NUMP)) +ALLOCATE(ZINP(2*KF_LT*TDZAS*D_NUMP)) +ALLOCATE(ZOUT(2*KF_LT*R_NDGNH*D_NUMP)) !$ACC DATA COPYIN (D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & !$ACC& CREATE (ZINP,ZOUT) & -!$ACC& PRESENT (ZAA,ZAS,PIA,PSOA1,PAOA1) +!$ACC& PRESENT (ZAA,ZAS,PIA,ZSOA1,ZAOA1) !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH - DO J1=2,KFC,2 + DO J1=2,2*KF_LT,2 KM = D_MYMS(KMLOC) IF(KM == 0)THEN - PSOA1(J1,JGL,KMLOC) = 0.0_JPRBT - PAOA1(J1,JGL,KMLOC) = 0.0_JPRBT + ZSOA1(J1,JGL,KMLOC) = 0.0_JPRBT + ZAOA1(J1,JGL,KMLOC) = 0.0_JPRBT END IF ENDDO ENDDO @@ -125,18 +122,18 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) IA = 1+MOD(R_NSMAX-KM+2,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*KFC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_LT)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*KFC)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*2*KF_LT)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -152,26 +149,26 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & KFC, R_NDGNH, TDZAA, & + & 2*KF_LT, R_NDGNH, TDZAA, & & 1.0_JPRBT, & - & ZINP, KFC, TDZAA,& + & ZINP, 2*KF_LT, TDZAA,& & ZAA, R_NDGNH, TDZAA, & & 0._JPRBT, & - & ZOUT, KFC, R_NDGNH, & + & ZOUT, 2*KF_LT, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN - PAOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) + ZAOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - PAOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) + ZAOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ENDIF ENDDO ENDDO @@ -181,18 +178,18 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) IS = 1+MOD(R_NSMAX-KM+1,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*KFC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_LT)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*KFC)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*2*KF_LT)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -204,26 +201,26 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & KFC, R_NDGNH, TDZAS, & + & 2*KF_LT, R_NDGNH, TDZAS, & & 1.0_JPRBT, & - & ZINP, KFC, TDZAS, & + & ZINP, 2*KF_LT, TDZAS, & & ZAS, R_NDGNH, TDZAS, & & 0._JPRBT, & - & ZOUT, KFC, R_NDGNH, & + & ZOUT, 2*KF_LT, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,KFC + DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IF(KM /= 0) THEN - PSOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) + ZSOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - PSOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFC) + ZSOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ENDIF ENDDO ENDDO @@ -231,20 +228,20 @@ SUBROUTINE LEINV(KFC,KF_OUT_LT,PIA,PAOA1,PSOA1) !$ACC END DATA -!$ACC DATA PRESENT(PAOA1,PSOA1,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) +!$ACC DATA PRESENT(ZAOA1,ZSOA1,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS) DO KMLOC=1,D_NUMP - DO JF=1,2*KF_OUT_LT + DO JF=1,2*KF_LT KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL, R_NDGNH IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_OUT_LT - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_OUT_LT + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_LT + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT - FOUBUF_IN(OFFSET1+JF) = PAOA1(JF,JGL,KMLOC)+PSOA1(JF,JGL,KMLOC) - FOUBUF_IN(OFFSET2+JF) = PSOA1(JF,JGL,KMLOC)-PAOA1(JF,JGL,KMLOC) + FOUBUF_IN(OFFSET1+JF) = ZAOA1(JF,JGL,KMLOC)+ZSOA1(JF,JGL,KMLOC) + FOUBUF_IN(OFFSET2+JF) = ZSOA1(JF,JGL,KMLOC)-ZAOA1(JF,JGL,KMLOC) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 96dabe7f1..c0c919d4b 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -33,8 +33,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& USE FSPGL_INT_MOD ,ONLY : FSPGL_INT USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS use ieee_arithmetic - !USE TPM_FIELDS ,ONLY : F,ZIA,ZSOA1,ZAOA1,ISTAN,ISTAS,ZEPSNM - USE TPM_FIELDS ,ONLY : F,ZIA,ZSOA1,ZAOA1,ZEPSNM + USE TPM_FIELDS ,ONLY : F,ZIA,ZEPSNM USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS @@ -125,9 +124,7 @@ END SUBROUTINE cudaProfilerStop EXTERNAL FSPGL_PROC OPTIONAL FSPGL_PROC - !REAL(KIND=JPRBT) :: ZEPSNM(d%nump,0:R%NTMAX+2) - - INTEGER(KIND=JPIM) :: IFC, ISTA, IIFC, IDGLU + INTEGER(KIND=JPIM) :: ISTA, IIFC, IDGLU INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,ISU,IDL,IDU INTEGER(KIND=JPIM) :: IFIRST, ILAST, IDIM1,IDIM2,IDIM3,J3 @@ -159,14 +156,6 @@ END SUBROUTINE cudaProfilerStop IFIRST = 1 ILAST = 0 - !* 1. PREPARE ZEPSNM. - ! --------------- - - !IF ( KF_UV > 0 .OR. KF_SCDERS > 0 ) THEN - ! CALL PREPSNM(ZEPSNM) - ! !$ACC update host(ZEPSNM) - !ENDIF - ! COPY FROM PSPXXXX TO ZIA IF (LSYNC_TRANS) THEN @@ -269,7 +258,6 @@ END SUBROUTINE cudaProfilerStop ! FROM ZIA TO ZAOA1 and ZSOA1 ISTA = 1 - IFC = 2*KF_OUT_LT IF(KF_UV > 0 .AND. .NOT. LVORGP) THEN ISTA = ISTA+2*KF_UV ENDIF @@ -278,7 +266,7 @@ END SUBROUTINE cudaProfilerStop ENDIF IF( KF_OUT_LT > 0 ) THEN - CALL LEINV(IFC,KF_OUT_LT,ZIA(ISTA:ISTA+IFC-1,:,:),ZAOA1,ZSOA1) + CALL LEINV(KF_OUT_LT,ZIA(ISTA:ISTA+2*KF_OUT_LT-1,:,:)) IF(PRESENT(FSPGL_PROC)) THEN stop 'Error: SPGL_PROC is not (yet) optimized in GPU version. Need to figure out how to implement' From 729ff7c0128c71536d78907090178b469afc5077 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:00 -0700 Subject: [PATCH 063/263] Merge write back and remove ZAOA and ZSOA --- src/trans/gpu/internal/leinv_mod.F90 | 92 +++++++++++++--------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 1b2aa81e7..0a5d9c93a 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -26,10 +26,6 @@ SUBROUTINE LEINV(KF_LT,PIA) ! -------------------- KFC - number of fields to tranform (input-c) ! PIA - spectral fields ! for zonal wavenumber KM (input) -! ZAOA1 - antisymmetric part of Fourier -! fields for zonal wavenumber KM (output) -! ZSOA1 - symmetric part of Fourier -! fields for zonal wavenumber KM (output) ! Implicit arguments : None. ! -------------------- @@ -59,7 +55,7 @@ SUBROUTINE LEINV(KF_LT,PIA) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS,ZAOA1,ZSOA1 +USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_GEN, ONLY: NOUT USE TPM_TRANS ,ONLY : FOUBUF_IN @@ -80,8 +76,9 @@ SUBROUTINE LEINV(KF_LT,PIA) ! LOCAL REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) +REAL(KIND=JPRBT) :: ZAOA, ZSOA -INTEGER(KIND=JPIM) :: JF, IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 +INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -102,23 +99,21 @@ SUBROUTINE LEINV(KF_LT,PIA) ALLOCATE(ZINP(2*KF_LT*TDZAS*D_NUMP)) ALLOCATE(ZOUT(2*KF_LT*R_NDGNH*D_NUMP)) -!$ACC DATA COPYIN (D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & -!$ACC& CREATE (ZINP,ZOUT) & -!$ACC& PRESENT (ZAA,ZAS,PIA,ZSOA1,ZAOA1) +!$ACC DATA COPYIN(D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & +!$ACC& CREATE (ZINP,ZOUT) & +!$ACC& PRESENT(ZAA,ZAS,PIA,FOUBUF_IN) & +!$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO J1=2,2*KF_LT,2 - KM = D_MYMS(KMLOC) - IF(KM == 0)THEN - ZSOA1(J1,JGL,KMLOC) = 0.0_JPRBT - ZAOA1(J1,JGL,KMLOC) = 0.0_JPRBT - END IF - ENDDO - ENDDO - !end loop over wavenumber -END DO +! READ 2:NSMAX+3 + +!IF KM=0 and NSMAX is 6: +! IA=1 +! DO=1,6/2+1 ... 1..4 +! PIA_2=1+1+(J-1)*2 ...2+(0..3)*2 .... 2,4,6,8 +!IF KM=0 and NSMAX is 7: +! IA=2 +! DO=1,7/2+1 ... 1..4 +! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -158,23 +153,36 @@ SUBROUTINE LEINV(KF_LT,PIA) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL,OFFSET1,OFFSET2,ZAOA) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_LT + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT + IF(KM /= 0) THEN - ZAOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZAOA = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZAOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZAOA = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ENDIF + FOUBUF_IN(OFFSET1+JK) = ZAOA ENDDO ENDDO ENDDO ! 2. +++++++++++++ symmetric +!IF KM=0 and NSMAX is 6: +! IS=2 +! DO=1,4 +! PIA_2=2+1+(0..3)*2 ... 3+(0..3)*2 ... 3,5,7,9 +!IF KM=0 and NSMAX is 7: +! IS=1 +! DO=1,5 +! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -210,43 +218,31 @@ SUBROUTINE LEINV(KF_LT,PIA) & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP DO JK=1,2*KF_LT KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH - IF(KM /= 0) THEN - ZSOA1(JK,JGL,KMLOC) = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA1(JK,JGL,KMLOC) = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ENDIF - ENDDO - ENDDO -ENDDO - -!$ACC END DATA - -!$ACC DATA PRESENT(ZAOA1,ZSOA1,D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS) -DO KMLOC=1,D_NUMP - DO JF=1,2*KF_LT - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL, R_NDGNH IGLS = R_NDGL+1-JGL OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_LT OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT - FOUBUF_IN(OFFSET1+JF) = ZAOA1(JF,JGL,KMLOC)+ZSOA1(JF,JGL,KMLOC) - FOUBUF_IN(OFFSET2+JF) = ZSOA1(JF,JGL,KMLOC)-ZAOA1(JF,JGL,KMLOC) + IF(KM /= 0) THEN + ZSOA = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ZSOA = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ENDIF + + ZAOA = FOUBUF_IN(OFFSET1+JK) + FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA + FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA ENDDO ENDDO ENDDO -!$ACC END DATA +!$ACC END DATA DEALLOCATE(ZINP) DEALLOCATE(ZOUT) From 22b06bcad314072ebf4723ef9cc75c9813cc0c82 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:00 -0700 Subject: [PATCH 064/263] Merge FOUBUF_IN filling for LEINV --- src/trans/gpu/internal/leinv_mod.F90 | 46 +++++++++------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 0a5d9c93a..87a1e5b4a 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -75,7 +75,7 @@ SUBROUTINE LEINV(KF_LT,PIA) REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) ! LOCAL -REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUTS(:), ZOUTA(:) REAL(KIND=JPRBT) :: ZAOA, ZSOA INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 @@ -97,10 +97,11 @@ SUBROUTINE LEINV(KF_LT,PIA) CALL GSTATS(453,0) ALLOCATE(ZINP(2*KF_LT*TDZAS*D_NUMP)) -ALLOCATE(ZOUT(2*KF_LT*R_NDGNH*D_NUMP)) +ALLOCATE(ZOUTS(2*KF_LT*R_NDGNH*D_NUMP)) +ALLOCATE(ZOUTA(2*KF_LT*R_NDGNH*D_NUMP)) !$ACC DATA COPYIN(D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & -!$ACC& CREATE (ZINP,ZOUT) & +!$ACC& CREATE (ZINP,ZOUTS,ZOUTA) & !$ACC& PRESENT(ZAA,ZAS,PIA,FOUBUF_IN) & !$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) @@ -141,7 +142,7 @@ SUBROUTINE LEINV(KF_LT,PIA) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION -!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUT) +!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & 2*KF_LT, R_NDGNH, TDZAA, & @@ -149,31 +150,10 @@ SUBROUTINE LEINV(KF_LT,PIA) & ZINP, 2*KF_LT, TDZAA,& & ZAA, R_NDGNH, TDZAA, & & 0._JPRBT, & - & ZOUT, 2*KF_LT, R_NDGNH, & + & ZOUTA, 2*KF_LT, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,ISL,JGL,OFFSET1,OFFSET2,ZAOA) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO JK=1,2*KF_LT - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_LT - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT - - IF(KM /= 0) THEN - ZAOA = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZAOA = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ENDIF - FOUBUF_IN(OFFSET1+JK) = ZAOA - ENDDO - ENDDO -ENDDO - ! 2. +++++++++++++ symmetric !IF KM=0 and NSMAX is 6: ! IS=2 @@ -206,7 +186,7 @@ SUBROUTINE LEINV(KF_LT,PIA) !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUT) +!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & 2*KF_LT, R_NDGNH, TDZAS, & @@ -214,7 +194,7 @@ SUBROUTINE LEINV(KF_LT,PIA) & ZINP, 2*KF_LT, TDZAS, & & ZAS, R_NDGNH, TDZAS, & & 0._JPRBT, & - & ZOUT, 2*KF_LT, R_NDGNH, & + & ZOUTS, 2*KF_LT, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA @@ -230,12 +210,13 @@ SUBROUTINE LEINV(KF_LT,PIA) OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT IF(KM /= 0) THEN - ZSOA = ZOUT(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUT((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ENDIF - ZAOA = FOUBUF_IN(OFFSET1+JK) FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA ENDDO @@ -244,7 +225,8 @@ SUBROUTINE LEINV(KF_LT,PIA) !$ACC END DATA DEALLOCATE(ZINP) -DEALLOCATE(ZOUT) +DEALLOCATE(ZOUTS) +DEALLOCATE(ZOUTA) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') From 868d88d019cea2eb7d579796ec7205c3b18533fb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:00 -0700 Subject: [PATCH 065/263] Merge FOUBUF_IN reading for LEDIR --- src/trans/gpu/internal/ledir_mod.F90 | 55 +++++++++++----------------- src/trans/gpu/internal/leinv_mod.F90 | 4 ++ 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 514b86cdf..b09f39380 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -80,11 +80,11 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPRBT) :: PAIA +REAL(KIND=JPRBT) :: PAIA, PAIS INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 -REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUT(:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZINPS(:), ZINPA(:), ZOUT(:) REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) @@ -94,13 +94,14 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ENDIF CALL GSTATS(452,0) -ALLOCATE(ZINP(2*KF_FS*R_NDGNH*D_NUMP)) +ALLOCATE(ZINPA(2*KF_FS*R_NDGNH*D_NUMP)) +ALLOCATE(ZINPS(2*KF_FS*R_NDGNH*D_NUMP)) ALLOCATE(ZOUT(2*KF_FS*TDZAS*D_NUMP)) ALLOCATE(ZINP0(2*KF_FS*R_NDGNH)) ALLOCATE(ZOUT0(2*KF_FS*TDZAS)) !$ACC DATA & -!$ACC& CREATE(ZINP,ZOUT,ZINP0,ZOUT0) & +!$ACC& CREATE(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,POA1) & @@ -108,11 +109,11 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! TODO this doesn't make sense that we need it (???) !$ACC KERNELS -ZINP(:) = 0 +ZINPS(:) = 0 +ZINPA(:) = 0 !$ACC END KERNELS -! anti-symmetric -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) DO KMLOC=1,D_NUMP DO JF=1,KF_FS*2 KM = D_MYMS(KMLOC) @@ -123,24 +124,28 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) IF (JF .LE. 4*KF_UV) THEN PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) ENDIF - ZINP(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ZINPA(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIS*F%RW(JGL) ENDDO ENDDO END DO +! anti-symmetric ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUT) +!$ACC HOST_DATA USE_DEVICE(ZAA,ZINPA,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRBT, & - & ZINP, 2*KF_FS, R_NDGNH, & + & ZINPA, 2*KF_FS, R_NDGNH, & & ZAA, R_NDGNH, TDZAA, & & 0.0_JPRBT, & & ZOUT, 2*KF_FS, TDZAA, & @@ -169,7 +174,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & - & = ZINP((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + & = ZINPA((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -201,34 +206,15 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! symmetric -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA) -DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS - PAIA = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - IF (JF .LE. 4*KF_UV) THEN - PAIA = PAIA*F%RACTHE(JGL) - ENDIF - ZINP((JF-1)+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) - ENDDO - ENDDO -END DO - ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUT) +!$ACC HOST_DATA USE_DEVICE(ZAS,ZINPS,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRBT, & - & ZINP, 2*KF_FS, R_NDGNH, & + & ZINPS, 2*KF_FS, R_NDGNH, & & ZAS, R_NDGNH, TDZAS, & & 0.0_JPRBT, & & ZOUT, 2*KF_FS, TDZAS, & @@ -254,7 +240,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & - & = ZINP((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + & = ZINPS((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -285,7 +271,8 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ENDIF !$ACC END DATA -DEALLOCATE(ZINP) +DEALLOCATE(ZINPA) +DEALLOCATE(ZINPS) DEALLOCATE(ZOUT) DEALLOCATE(ZINP0) DEALLOCATE(ZOUT0) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 87a1e5b4a..d123375cb 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -215,6 +215,10 @@ SUBROUTINE LEINV(KF_LT,PIA) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ELSE + ! Imaginary values of KM=0 is zero, though I don't think we care + ZSOA = 0_JPRBT + ZAOA = 0_JPRBT ENDIF FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA From f7ab48fe3efc66ef73a9909940670ee783de6000 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:00 -0700 Subject: [PATCH 066/263] Cleanup setup_trans and allocate less data --- src/trans/gpu/external/dir_trans.F90 | 7 +- src/trans/gpu/external/inv_trans.F90 | 7 +- src/trans/gpu/external/setup_trans.F90 | 191 ++++------------------- src/trans/gpu/external/trans_end.F90 | 55 ++----- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ltdir_mod.F90 | 2 +- src/trans/gpu/internal/tpm_fields.F90 | 54 ------- 7 files changed, 45 insertions(+), 273 deletions(-) diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 index 9060faff1..7c4939467 100755 --- a/src/trans/gpu/external/dir_trans.F90 +++ b/src/trans/gpu/external/dir_trans.F90 @@ -116,7 +116,7 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & NF_SC2, NF_SC3A, NF_SC3B, & & NGPBLKS, NPROMA USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV -USE TPM_FIELDS ,ONLY : IF_FS_DIR,IF_FS_DIR0,NFLEV,NFLEV0,DTDZBA,DTDZBS,DTDZCA,DTDZCS +USE TPM_FIELDS ,ONLY : IF_FS_DIR,IF_FS_DIR0,NFLEV,NFLEV0 USE TPM_FLT, ONLY: S USE TPM_GEOMETRY ,ONLY : G USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -329,11 +329,6 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF_FS_DIR=2*IF_FS+2!2*(2*IF_UV+NFLEV+2+IF_PP) print*,"dir_trans: IF_FS_DIR=",IF_FS_DIR," IF_FS_DIR0=",IF_FS_DIR0 -DTDZBA=IF_FS_DIR -DTDZBS=IF_FS_DIR -DTDZCA=IF_FS_DIR -DTDZCS=IF_FS_DIR - ! Consistency checks IF (IF_UV > 0) THEN diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index ca75c2171..01f89013c 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -132,7 +132,7 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, LATLON, & & NF_SC2, NF_SC3A, NF_SC3B, NGPBLKS, NPROMA -USE TPM_FIELDS ,ONLY : IF_FS_INV,IF_FS_INV0,ITDZBA,ITDZBS,ITDZCA,ITDZCS +USE TPM_FIELDS ,ONLY : IF_FS_INV,IF_FS_INV0 USE TPM_FLT, ONLY: S USE TPM_GEOMETRY ,ONLY : G !USE TPM_GEOMETRY @@ -421,11 +421,6 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF_FS_INV=2*IF_OUT_LT print*,"inv_trans: IF_FS_INV=",IF_FS_INV," IF_FS_INV0=",IF_FS_INV0 -ITDZBA=IF_FS_INV -ITDZBS=IF_FS_INV -ITDZCA=IF_FS_INV -ITDZCS=IF_FS_INV - ! Consistency checks IF (IF_UV > 0) THEN diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index cd5f8a43c..fe7159659 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -107,15 +108,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC,nprtrv, D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,mysetv,mysetw, MYPROC USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZIA,ZEPSNM,ZSOA1,ZAOA1,ISTAN,ISTAS,ZAIA,ZOA1,ZOA2, & -& ZAA,ZAS,LDZAA,LDZAS,TDZAA,TDZAS,& -& IZBS,ILDZBA,ILDZBS,ITDZBA0,ITDZBS0,& -& IZCA,IZCS,IZCST,ILDZCA,ILDZCS,ITDZCA0,ITDZCS0,& -& DZBAT,DZBST,DLDZBA,DLDZBS,DTDZBA0,DTDZBS0,& -& DZCAT,DZCST,DLDZCA,DLDZCS,DTDZCA0,DTDZCS0,& -& IF_FS_INV0,IF_FS_DIR0,NFLEV0,ZAA0,DZBST0,DZCAT0,& -& ZAS0,DZCST0,KMLOC0 -! IZBA,IZCAT +USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZIA,ZEPSNM,ZOA1,ZOA2, & +& ZAA,ZAS,TDZAA,TDZAS,& +& IF_FS_INV0,IF_FS_DIR0,NFLEV0,ZAA0,& +& ZAS0,KMLOC0 USE TPM_FFT ,ONLY : T, FFT_RESOL #ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL @@ -518,113 +514,25 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !leading and trailing dimensions of A for symmetric and antisymmetric cases ! (same for ltinv and ltdir) -LDZAA=R%NDGNH -LDZAS=R%NDGNH TDZAA=(R%NTMAX+2)/2 TDZAS=(R%NTMAX+3)/2 print*,'R%NTMAX=',R%NTMAX print*,'R%NSMAX=',R%NSMAX -!similarly for B (ltinv) -ILDZBA=(R%NSMAX+2)/2 -ILDZBS=(R%NSMAX+3)/2 -ITDZBA0=IF_FS_INV0 -ITDZBS0=IF_FS_INV0 - -!similarly for C (ltinv) -ILDZCA=R%NDGNH -ILDZCS=R%NDGNH -ITDZCA0=IF_FS_INV0 -ITDZCS0=IF_FS_INV0 - -!similarly for B (ltdir) -DLDZBA=R%NDGNH -DLDZBS=R%NDGNH -DTDZBA0=IF_FS_DIR0 -DTDZBS0=IF_FS_DIR0 - -!similarly for C (ltdir) -DLDZCA=(R%NTMAX+2)/2 -DLDZCS=(R%NTMAX+3)/2 -DTDZCA0=IF_FS_DIR0 -DTDZCS0=IF_FS_DIR0 - -! competition: NPRTRV ... larger == NUMP ... larger == NSMAX/NPRTRW -! setting NPRTRV=20 ... leads to 7GB ZAA since NUMP==55 - -!allocate matrices for matrix multiplications -!ALLOCATE(IZBA(IF_FS_INV0*TDZAA*D%NUMP)) -ALLOCATE(IZBS(IF_FS_INV0*TDZAS*D%NUMP)) -print*,"New: allocating IZBS as a 1D array!" -! just use IZBS -!IZBA=>IZBS(:,1:TDZAA,:) -ALLOCATE(ZAA(R%NDGNH,TDZAA,D%NUMP)) -ALLOCATE(ZAS(R%NDGNH,TDZAS,D%NUMP)) - -! Allocate matrices for rescaling to allow half-precision Legendre transforms -!ALLOCATE(ZAMAX(IF_FS_INV0,D%NUMP)) -!ALLOCATE(ZSMAX(IF_FS_INV0,D%NUMP)) - -! transpose of C (for better memory access patterns) -!ALLOCATE(IZCAT(IF_FS_INV0,R%NDGNH,D%NUMP)) -ALLOCATE(IZCST(IF_FS_INV0*R%NDGNH*D%NUMP)) - -!ALLOCATE(DZBAT(IF_FS_DIR0,R%NDGNH,D%NUMP)) -ALLOCATE(DZBST(IF_FS_DIR0*R%NDGNH*D%NUMP)) - -! transpose of C (for better memory access patterns) -ALLOCATE(DZCAT(IF_FS_DIR0*TDZAA*D%NUMP)) -ALLOCATE(DZCST(IF_FS_DIR0*TDZAS*D%NUMP)) -DZCAT(:) = 0 -DZCST(:) = 0 -IZCST(:) = 0 -!DZCAT=>DZCST(:,1:TDZAA,:) - -write(nout,*)'setup_trans: sizes1 NUMP=',D%NUMP -write(nout,*)'ZAS:',size(ZAS) -write(nout,*)'IZBS :',size(IZBS ) -write(nout,*)'IZCST:',size(IZCST) -write(nout,*)'DZBST:',size(DZBST) -write(nout,*)'DZCST:',size(DZCST) -write(nout,*)'DZCAT:',size(DZCAT) -!!!$ACC ENTER DATA CREATE(ZAA,ZAS,IZBA,IZBS,IZCAT,IZCST,DZBAT,DZBST,DZCAT,DZCST) -!$ACC ENTER DATA COPYIN(ZAA,ZAS,IZBS,IZCST,DZBST,DZCST,DZCAT) & +!$ACC ENTER DATA & !$ACC& COPYIN(F,F%RN,F%RLAPIN,S,S%FA,S%ITHRESHOLD,S%LUSEFLT,D,D%NUMP,D%MYMS,R,R%NDGNH,R%NSMAX,G,G%NDGLU) & !$ACC& copyin(D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,G%NMEN,D%NPROCM,D%NPTRLS,G,G%NLOEN,D%MSTABF) ! Initialize A arrays -izbs = 0._JPRBT -!$acc update device(izbs) -dzbst = 0._JPRBT -!$acc update device(dzbst) - -! zero arrays -!$ACC PARALLEL LOOP -DO JMLOC=1,D%NUMP - !$ACC loop - DO JK=1,TDZAA - !$ACC loop - DO J=1,LDZAA - ZAA(J,JK,JMLOC)=0._JPRBT - ENDDO - ENDDO -ENDDO - -!$ACC PARALLEL LOOP -DO JMLOC=1,D%NUMP - !$ACC loop - DO JK=1,TDZAS - !$ACC LOOP - DO J=1,LDZAS - ZAS(J,JK,JMLOC)=0._JPRBT - ENDDO - ENDDO -ENDDO +ALLOCATE(ZAA(R%NDGNH,TDZAA,D%NUMP)) +ALLOCATE(ZAS(R%NDGNH,TDZAS,D%NUMP)) -! Do this on the host +write(nout,*)'setup_trans: sizes1 NUMP=',D%NUMP +write(nout,*)'ZAS:',size(ZAS) +write(nout,*)'ZAA:',size(ZAA) -zaa(:,:,:) = 0 +ZAA(:,:,:) = 0 DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) KDGLU = MIN(R%NDGNH,G%NDGLU(KM)) @@ -649,11 +557,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ENDDO ENDDO ENDDO - -! permanent copy of Legendre polynomials into device - -!$ACC update device(ZAA) -!$ACC update device(ZAS) +!$ACC ENTER DATA COPYIN(ZAA,ZAS) IF_FOUBUF=MAX(IF_OUT_LT,IF_FS) ALLOCATE(FOUBUF_IN(MAX(1,D%NLENGT0B*2*IF_FOUBUF))) @@ -662,46 +566,23 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ALLOCATE(ZGTF(2*IF_FS,D%NLENGTF)) write(nout,*)'ZGTF :',size(ZGTF) -!$ACC enter data create(ZGTF) +!$ACC ENTER DATA CREATE(ZGTF) ALLOCATE(ZIA(IF_FS_INV0,R%NLEI1,D%NUMP)) -ALLOCATE(ZEPSNM(d%nump,0:R%NTMAX+2)) -ALLOCATE(ZSOA1(2*IF_OUT_LT,R%NLEI3,D%NUMP)) -ALLOCATE(ZAOA1(2*IF_OUT_LT,R%NLEI3,D%NUMP)) -ALLOCATE(ISTAN(D%NUMP,R%NDGNH)) -ALLOCATE(ISTAS(D%NUMP,R%NDGNH)) -!ALLOCATE(ZSIA(IF_FS_INV0,R%NDGNH,D%NUMP)) -ALLOCATE(ZAIA(IF_FS_INV0,R%NDGNH,D%NUMP)) ALLOCATE(ZOA1(4*IF_FS_DIR0,R%NLED4,D%NUMP)) ALLOCATE(ZOA2(MAX(4*IF_UV,1),R%NLED4,D%NUMP)) +ALLOCATE(ZEPSNM(d%nump,0:R%NTMAX+2)) write(nout,*)'ZIA :',size(ZIA ) -write(nout,*)'ZSOA1:',size(ZSOA1) -write(nout,*)'ZAOA1:',size(ZAOA1) -write(nout,*)'ZAIA :',size(ZAIA ) write(nout,*)'ZOA1 :',size(ZOA1 ) write(nout,*)'ZOA2 :',size(ZOA2 ) -!!!$ACC enter data create(ZIA,ZEPSNM,ZSOA1,ZAOA1,ISTAN,ISTAS,ZSIA,ZAIA,ZOA1,ZOA2) -!$ACC enter data create(ZIA,ZEPSNM,ZSOA1,ZAOA1,ZAIA,ZOA1,ZOA2) -zepsnm = 0._JPRBT +ZEPSNM = 0._JPRBT CALL PREPSNM -!$acc update device(zepsnm) -zgtf = 0._JPRBT -!$acc update device(zgtf) -zia = 0._JPRBT -!$acc update device(zia) -!zsia = 0._JPRBT -!!!$acc update device(zsia) -zaia = 0._JPRBT -!$acc update device(zaia) -zoa1 = 0._JPRBT -!$acc update device(zoa1) -zoa2 = 0._JPRBT -!$acc update device(zoa2) -zaoa1 = 0._JPRBT -!$acc update device(zaoa1) -zsoa1 = 0._JPRBT -!$acc update device(zsoa1) +ZGTF = 0._JPRBT +ZIA = 0._JPRBT +ZOA1 = 0._JPRBT +ZOA2 = 0._JPRBT +!$ACC ENTER DATA COPYIN(ZIA,ZEPSNM,ZOA1,ZOA2) ! add arrays for GPNORM1 ALLOCATE(ZAVE(IF_FS,R%NDGL)) @@ -709,18 +590,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) ALLOCATE(ZMINGPN(IF_FS)) ALLOCATE(ZMAXGPN(IF_FS)) -!$ACC enter data create(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) - -zave = 0._JPRBT -!$acc update device(zave) -zmingl = 0._JPRBT -!$acc update device(zmingl) -zmaxgl = 0._JPRBT -!$acc update device(zmaxgl) -zmingpn = 0._JPRBT -!$acc update device(zmingpn) -zmaxgpn = 0._JPRBT -!$acc update device(zmaxgpn) + +ZAVE = 0._JPRBT +ZMINGL = 0._JPRBT +ZMAXGL = 0._JPRBT +ZMINGPN = 0._JPRBT +ZMAXGPN = 0._JPRBT +!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) !set up flat copies of constant data R_NSMAX=R%NSMAX @@ -821,18 +697,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF(KMLOC0 >= 0) THEN ALLOCATE(ZAA0(R%NDGNH,TDZAA)) ALLOCATE(ZAS0(R%NDGNH,TDZAS)) - ALLOCATE(DZBST0(IF_FS_DIR0*R%NDGNH)) - ALLOCATE(DZCAT0(IF_FS_DIR0*TDZAA)) - ALLOCATE(DZCST0(IF_FS_DIR0*TDZAS)) - DZCAT0(:) = 0 - DZCST0(:) = 0 - !$ACC ENTER DATA COPYIN(ZAA0,DZBST0,DZCAT0,ZAS0,DZCST0) ZAA0 = ZAA(:,:,KMLOC0) ZAS0 = ZAS(:,:,KMLOC0) - !$ACC update device(ZAA0) - !$ACC update device(ZAS0) - dzbst0 = 0._JPRD - !$acc update device(dzbst0) + !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' ENDIF diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 69d45f9ab..5f7156c9d 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -49,12 +50,7 @@ SUBROUTINE TRANS_END(CDMODE) USE TPM_DISTR ,ONLY : D, DISTR_RESOL, NPRCIDS,D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL,F_RW, ZIA,ZEPSNM,ZSOA1,ZAOA1,ISTAN,ISTAS,ZSIA,ZAIA,ZOA1,ZOA2, & -& ZAA,ZAS,LDZAA,LDZAS,TDZAA,TDZAS,& -& IZBS,ILDZBA,ILDZBS,ITDZBA,ITDZBS,& -& IZCA,IZCS,IZCST,ILDZCA,ILDZCS,ITDZCA,ITDZCS,& -& DZBAT,DZBST,DLDZBA,DLDZBS,DTDZBA,DTDZBS,& -& DZCA,DZCS,DZCAT,DZCST,DLDZCA,DLDZCS,DTDZCA,DTDZCS +USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL,F_RW, ZIA,ZEPSNM,ZOA1,ZOA2,ZAA,ZAS,ZAA0,ZAS0 USE TPM_FFT ,ONLY : T, FFT_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL #ifdef WITH_FFTW @@ -78,48 +74,21 @@ SUBROUTINE TRANS_END(CDMODE) IF (PRESENT(CDMODE)) CLMODE=CDMODE IF (CLMODE == 'FINAL') THEN + !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZIA,ZEPSNM,ZOA1,ZOA2,ZAA,ZAS,ZGTF) + DEALLOCATE(ZAA0) + DEALLOCATE(ZAS0) + DEALLOCATE(ZIA) + DEALLOCATE(ZEPSNM) + DEALLOCATE(ZOA1) + DEALLOCATE(ZOA2) DEALLOCATE(ZAA) DEALLOCATE(ZAS) - - !DEALLOCATE(IZBA) - DEALLOCATE(IZBS) - !DEALLOCATE(IZCA) - !DEALLOCATE(IZCS) - !DEALLOCATE(IZCAT) - DEALLOCATE(IZCST) - - - !DEALLOCATE(DZBA) - !DEALLOCATE(DZBS) - DEALLOCATE(DZBAT) - DEALLOCATE(DZBST) - !DEALLOCATE(DZCA) - !DEALLOCATE(DZCS) - DEALLOCATE(DZCAT) - DEALLOCATE(DZCST) - - !$ACC exit data delete(ZAA,ZAS,IZBS,IZCST,DZBAT,DZBST,DZCAT,DZCST) - + DEALLOCATE(ZGTF) + !memory save DEALLOCATE(FOUBUF_IN) DEALLOCATE(FOUBUF) - - - !$ACC exit data delete(ZGTF) - DEALLOCATE(ZGTF) - - DEALLOCATE(ZIA) - DEALLOCATE(ZEPSNM) - DEALLOCATE(ZSOA1) - DEALLOCATE(ZAOA1) - DEALLOCATE(ISTAN) - DEALLOCATE(ISTAS) - DEALLOCATE(ZSIA) - DEALLOCATE(ZAIA) - DEALLOCATE(ZOA1) - !DEALLOCATE(ZOA2) - !$ACC exit data delete(ZIA,ZEPSNM,ZSOA1,ZAOA1,ISTAN,ISTAS,ZSIA,ZAIA,ZOA1) - + DEALLOCATE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_MYMS,D_NPROCL,D_NASM0,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,F_RW) !$ACC exit data delete(R_NSMAX,R_NTMAX,R_NDGL,R_NNOEXTZL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL,D_NUMP,D_MYMS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 7f461e719..fb6aa29cf 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -51,7 +51,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & USE LTDIR_MOD ,ONLY : LTDIR USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE - USE TPM_FIELDS ,ONLY : ZSIA,ZAIA,ZOA1,ZEPSNM + USE TPM_FIELDS ,ONLY : ZOA1,ZEPSNM IMPLICIT NONE diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index d6fa3bdea..940d34561 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -28,7 +28,7 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP - USE TPM_FIELDS ,ONLY : ZAIA,ZOA1,ZOA2,ZEPSNM + USE TPM_FIELDS ,ONLY : ZOA1,ZOA2,ZEPSNM !**** *LTDIR* - Control of Direct Legendre transform step diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index 6781c0def..3067f2228 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -42,77 +42,23 @@ MODULE TPM_FIELDS REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 - -REAL(KIND=JPRBT), POINTER :: IZBA(:,:,:) !! JPRL for 1/2 -!!origSam REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: IZBS(:,:,:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: IZBS(:) !! from working RAPS -REAL(KIND=JPRBT),ALLOCATABLE :: IZCA(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: IZCS(:,:,:) -!REAL(KIND=JPRBT),ALLOCATABLE :: IZCAT(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: IZCST(:) - -REAL(KIND=JPRBT),ALLOCATABLE :: DZBA(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: DZBS(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: DZBAT(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: DZBST(:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: DZCA(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: DZCS(:,:,:) -!REAL(KIND=JPRBT),POINTER :: DZCAT(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: DZCAT(:) -REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: DZCST(:) - -! Arrays used for rescaling to allow half-precision Legende transforms -REAL(KIND=JPRBT), ALLOCATABLE :: ZAMAX(:,:) -REAL(KIND=JPRBT), ALLOCATABLE :: ZSMAX(:,:) - ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZAS0(:,:) -REAL(KIND=JPRD),ALLOCATABLE :: DZBST0(:) -REAL(KIND=JPRD),ALLOCATABLE :: DZCAT0(:) -REAL(KIND=JPRD),ALLOCATABLE :: DZCST0(:) INTEGER(KIND=JPIM) :: KMLOC0 -INTEGER(KIND=JPIM) :: LDZAA -INTEGER(KIND=JPIM) :: LDZAS INTEGER(KIND=JPIM) :: TDZAA INTEGER(KIND=JPIM) :: TDZAS -INTEGER(KIND=JPIM) :: ILDZBA -INTEGER(KIND=JPIM) :: ILDZBS -INTEGER(KIND=JPIM) :: ILDZCA -INTEGER(KIND=JPIM) :: ILDZCS - - - -INTEGER(KIND=JPIM) :: DLDZBA -INTEGER(KIND=JPIM) :: DLDZBS -INTEGER(KIND=JPIM) :: DLDZCA -INTEGER(KIND=JPIM) :: DLDZCS - ! enable calling setup_trans with a different set of fields than inv_trans and dir_trans: ! IF_FS_INV0: size used for the allocation in setup_trans ! IF_FS_INV: size used in inv_trans and dir_Trans, needs to be <= IF_FS_INV0 INTEGER(KIND=JPIM) :: IF_FS_INV, IF_FS_INV0 INTEGER(KIND=JPIM) :: IF_FS_DIR, IF_FS_DIR0 INTEGER(KIND=JPIM) :: NFLEV, NFLEV0 -INTEGER(KIND=JPIM) :: ITDZBA, ITDZBA0 -INTEGER(KIND=JPIM) :: ITDZBS, ITDZBS0 -INTEGER(KIND=JPIM) :: DTDZBA, DTDZBA0 -INTEGER(KIND=JPIM) :: DTDZBS, DTDZBS0 -INTEGER(KIND=JPIM) :: DTDZCA, DTDZCA0 -INTEGER(KIND=JPIM) :: DTDZCS, DTDZCS0 -INTEGER(KIND=JPIM) :: ITDZCA, ITDZCA0 -INTEGER(KIND=JPIM) :: ITDZCS, ITDZCS0 REAL(KIND=JPRB),ALLOCATABLE, TARGET :: ZIA(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZSOA1(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZAOA1(:,:,:) -INTEGER(KIND=JPIM),ALLOCATABLE :: ISTAN(:,:) -INTEGER(KIND=JPIM),ALLOCATABLE :: ISTAS(:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZSIA(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZAIA(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA1(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA2(:,:,:) From c2f828d2bd3b5827d8525357bb8f966ef5c4a6cc Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:01 -0700 Subject: [PATCH 067/263] Document inigptr and trgtol --- src/trans/gpu/internal/inigptr_mod.F90 | 22 +++--- src/trans/gpu/internal/trgtol_mod.F90 | 99 +++++++++++++++----------- 2 files changed, 72 insertions(+), 49 deletions(-) diff --git a/src/trans/gpu/internal/inigptr_mod.F90 b/src/trans/gpu/internal/inigptr_mod.F90 index 87c2a5fd6..26391df10 100755 --- a/src/trans/gpu/internal/inigptr_mod.F90 +++ b/src/trans/gpu/internal/inigptr_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -37,18 +38,17 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) IROF=1 IBFIRST=1 IPROCLAST=D%NPROCL(D%NFRSTLOFF+1) +! for each latitude on this processor DO JGL=1,D%NDGL_GP - ! Find processor which deals with this latitude in Fourier distribution + ! find the processor where this row should be saved in the fourier distribution + ! this is called the "w-set" IPROC=D%NPROCL(D%NFRSTLOFF+JGL) - IF(IPROC > NPRTRNS) THEN - WRITE(NOUT,'(A,I8)')& - &' INIGPTR ERROR : exceeding processor limit ',NPRTRNS - CALL ABORT_TRANS(' INIGPTR ERROR : exceeding processor limit ') - ENDIF - ! for each latitude on this processor, find first and last points - ! for each NPROMA chunk, for each destination processor + ! for each latitude on this processor, find first and last points + ! for each NPROMA chunk, for each destination processor IF(IPROC /= IPROCLAST) THEN + ! we got onto a new process, we still need to finish the last block of the previous + ! process IF(IROF > 1) THEN KGPTRSEND(1,IBLOCK,IPROCLAST)=IBFIRST KGPTRSEND(2,IBLOCK,IPROCLAST)=IROF-1 @@ -56,10 +56,14 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) IF(IROF <= NPROMA) IBFIRST=IROF IPROCLAST=IPROC ENDIF + ! my offset of the first gridpoint in this row (globally, in EW-direction) IFIRST=D%NSTA(D%NPTRFLOFF+JGL,MY_REGION_EW) + ! my offset of the last gridpoint in this row (globally, in EW-direction) ILAST =IFIRST + D%NONL(D%NPTRFLOFF+JGL,MY_REGION_EW) -1 + ! now go through all gridpoints on this latitude DO JBL=IFIRST,ILAST IF(IROF == NPROMA) THEN + ! this block is full! IBLAST=IROF KGPTRSEND(1,IBLOCK,IPROC)=IBFIRST KGPTRSEND(2,IBLOCK,IPROC)=IBLAST @@ -71,7 +75,7 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) ENDDO ENDDO IF(IROF /= 1.AND.IROF /= IBFIRST) THEN -! non-empty residual block after last latitude line + ! non-empty residual block after last latitude line IBLAST=IROF-1 KGPTRSEND(1,IBLOCK,IPROC)=IBFIRST KGPTRSEND(2,IBLOCK,IPROC)=IBLAST diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 62b16304f..f7d57842e 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -107,8 +107,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) REAL(KIND=JPRBT) :: ZDUM(2) - INTEGER(KIND=JPIM) :: ISENT (NPROC) - INTEGER(KIND=JPIM) :: IRCVD (NPROC) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) @@ -118,8 +116,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& &ILASTLAT, ILEN, JROC, IPOS, ISETA, & - &ISETB, IRECV, IRECVSET, & - &ISETV, ISEND, ISENDSET, ITAG, J, JBLK, JFLD, & + &ISETB, IRECV, & + &ISETV, ISEND, ITAG, J, JBLK, JFLD, & &JGL, JK, JL, JLOOP, ISETW, IFLD, & &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX @@ -351,69 +349,90 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO JROC=1,NPROC CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - IRECVSET = ISETA + ISEND = JROC - ISENDSET = ISETV - ISENT(JROC) = 0 - IRCVD(JROC) = 0 - - ! count up expected number of fields + + ! count up expected number of fields + ! if fields are not distributed over processes, KVSET=-=1, so IPOS=KF_GP, + ! otherwise we count how many fields are stored in this V-set. This is used to figure out + ! how much data we are going to send *to* that process JROC. Keep in mind at this point (g-space) + ! all fields are on this process so we have total overlap! + ! basically we count the size of the V set IPOS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISENDSET .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 + IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 ENDDO ISEND_FLD_TOTAL(JROC) = IPOS + ! the W-set is horizontal distribution only - how much data are we going to send to + ! that process? IGPTRRECV tells me about one layer. ISENDTOT(JROC) = IGPTRRECV(ISETW)*IPOS IF( JROC /= MYPROC) THEN + ! IBUFLENS sums up all but my process IBUFLENS = MAX(IBUFLENS,ISENDTOT(JROC)) IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that INSEND = INSEND+1 JSEND(INSEND)=JROC ENDIF ENDIF - - IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(IRECVSET)) - ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(IRECVSET)) - + + ! THIS IS RECEIVER SIDE: + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. + IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) + ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) + IPOS = 0 DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(IRECVSET)+JGL-D%NFRSTLAT(IRECVSET) + ! get from "actual" latitude to the latitude strip offset + IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) + ! number of gridpoints on this latitude strip on my process IPOS = IPOS+D%NONL(IGL,ISETB) ENDDO - + ! We always receive the full fourier space IRECVTOT(JROC) = IPOS*KF_FS - - IF(IRECVTOT(JROC) > 0 .AND. MYPROC /= JROC) THEN - INRECV = INRECV + 1 - JRECV(INRECV)=JROC + + IF( JROC /= MYPROC) THEN + ! IBUFLENR sums up all but my process + IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + INRECV = INRECV + 1 + JRECV(INRECV)=JROC + ENDIF ENDIF - - IF( JROC /= MYPROC) IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) - - IF(IPOS > 0) THEN + + IF(IRECVTOT(JROC) > 0) THEN + ! If I have to recv something, we need to fill KINDEX, this is the unpacking instruction... + + ! INDOFF is the offset to the first gridpoint on this process, only considering a + ! single layer, e.g KF_FS=1 INDOFF(JROC) = INDOFFX - INDOFFX = INDOFFX+IPOS + INDOFFX = INDOFFX+IRECVTOT(JROC)/KF_FS + IPOS = 0 DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(IRECVSET)+JGL-D%NFRSTLAT(IRECVSET) + ! get from "actual" latitude to the latitude strip offset + IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset IGLL = JGL-D%NPTRLS(MYSETW)+1 DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),& &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1 IPOS = IPOS+1 - KINDEX(IPOS+INDOFF(JROC)) = JL + ! indicates where the data has to be stored + KINDEX(INDOFF(JROC)+IPOS) = JL ENDDO ENDDO ENDIF ENDDO - ISENDCOUNT=0 - IRECVCOUNT=0 - DO J=1,NPROC - ISENDCOUNT=MAX(ISENDCOUNT,ISENDTOT(J)) - IRECVCOUNT=MAX(IRECVCOUNT,IRECVTOT(J)) - ENDDO + ISENDCOUNT=MAXVAL(ISENDTOT) + IRECVCOUNT=MAXVAL(IRECVTOT) IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(IRECVCOUNT,INRECV)) @@ -438,6 +457,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) ASYNC(1) IF(ISENDTOT(MYPROC) > 0 )THEN + ! I have to send something to myself... + ! Input is KF_GP fields. We find the resulting KF_FS fields. IFLDS = 0 DO JFLD=1,KF_GP @@ -475,9 +496,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ILAST = IGPTRSEND(2,JBLK,MYSETW) JK = JKL+IFIRST-1 IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - IFLD = IFLDOFF(JFLD) - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLD,JBLK) + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL + PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDOFF(JFLD),JBLK) ENDIF ENDDO ENDDO @@ -491,7 +511,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ILAST = IGPTRSEND(2,JBLK,MYSETW) JK = JKL+IFIRST-1 IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL IFLD = IFLDOFF(JFLD) IF(LLUV(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) @@ -523,12 +543,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INS=1,INSEND ISEND=JSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - ISENDSET = ISETV ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) IFLD = 0 IPOS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISENDSET .OR. KVSET(JFLD) == -1 ) THEN + IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN IFLD = IFLD+1 IFLDA(IFLD)=JFLD ENDIF From 91585879783389eb662f65f86fe4bf7f4007e81b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:01 -0700 Subject: [PATCH 068/263] Simplify summing over blocks --- src/trans/gpu/internal/inigptr_mod.F90 | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/trans/gpu/internal/inigptr_mod.F90 b/src/trans/gpu/internal/inigptr_mod.F90 index 26391df10..9dc55eca0 100755 --- a/src/trans/gpu/internal/inigptr_mod.F90 +++ b/src/trans/gpu/internal/inigptr_mod.F90 @@ -33,7 +33,8 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) ! Compute tables to assist GP to/from Fourier space transpositions -KGPTRSEND(:,:,:)=0 +KGPTRSEND(1,:,:)=0 +KGPTRSEND(2,:,:)=-1 IBLOCK=1 IROF=1 IBFIRST=1 @@ -81,15 +82,7 @@ SUBROUTINE INIGPTR(KGPTRSEND,KGPTRRECV) KGPTRSEND(2,IBLOCK,IPROC)=IBLAST ENDIF ! sum up over blocks -KGPTRRECV(:)=0 -DO JPRTRNS=1,NPRTRNS - DO JBLKS=1,NGPBLKS - IF(KGPTRSEND(1,JBLKS,JPRTRNS) > 0) THEN - KGPTRRECV(JPRTRNS)=KGPTRRECV(JPRTRNS)+& - &KGPTRSEND(2,JBLKS,JPRTRNS)-KGPTRSEND(1,JBLKS,JPRTRNS)+1 - ENDIF - ENDDO -ENDDO +KGPTRRECV(:)=SUM(KGPTRSEND(2,:,:),1)-SUM(KGPTRSEND(1,:,:),1)+NGPBLKS END SUBROUTINE INIGPTR END MODULE INIGPTR_MOD From 6be9c7c08353582c378c13c811054bcb70b52b53 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:01 -0700 Subject: [PATCH 069/263] cleanup interface --- src/trans/gpu/internal/trgtol_mod.F90 | 146 +++----------------------- src/trans/gpu/internal/trltog_mod.F90 | 8 +- 2 files changed, 13 insertions(+), 141 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index f7d57842e..bc135bc5b 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -79,7 +79,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& USE TPM_GEN ,ONLY : NOUT, LSYNC_TRANS USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL, & & MYSETV, MYSETW, MYPROC, NPROC - USE TPM_TRANS ,ONLY : LDIVGP, LGPNORM, LSCDERS, LUVDER, LVORGP, NGPBLKS + USE TPM_TRANS ,ONLY : NGPBLKS USE INIGPTR_MOD ,ONLY : INIGPTR USE PE2SET_MOD ,ONLY : PE2SET @@ -123,13 +123,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX ! LOCAL LOGICAL SCALARS - LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) - LOGICAL :: LLDONE, LLINDER + LOGICAL :: LLINDER INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) - INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF - INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 + INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR + INTEGER(KIND=JPIM) :: IOFF,IOFF1,J1,J2 INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(KF_FS) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT INTEGER(KIND=JPIM) :: INUMFLDS @@ -163,47 +162,15 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1805,0) - LLINDER = .FALSE. - LLPGPUV = .FALSE. - LLPGP3A = .FALSE. - LLPGP3B = .FALSE. - LLPGP2 = .FALSE. - LLPGPONLY = .FALSE. - IF(PRESENT(KPTRGP)) LLINDER = .TRUE. - IF(PRESENT(PGP)) LLPGPONLY = .TRUE. - IF(PRESENT(PGPUV)) LLPGPUV = .TRUE. - IF(PRESENT(PGP3A)) LLPGP3A = .TRUE. - IF(PRESENT(PGP3B)) LLPGP3B = .TRUE. - IF(PRESENT(PGP2)) LLPGP2 = .TRUE. IUVPAR=0 IUVLEV=0 IOFF1=0 - IOFFNS=KF_SCALARS_G - IOFFEW=2*KF_SCALARS_G LLUV(:) = .FALSE. IUVPARS(:) = -99 IUVLEVS(:) = -99 - IF (LLPGPUV) THEN + IF (PRESENT(PGPUV)) THEN IOFF=0 IUVLEV=UBOUND(PGPUV,2) - IF(LVORGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF - IF(LDIVGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF DO J=1,IUVLEV IUVLEVS(IOFF+J)=J IUVPARS(IOFF+J)=IUVPAR+1 @@ -214,54 +181,21 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE. IOFF=IOFF+2*IUVLEV IOFF1=IOFF - IOFFNS=IOFFNS+IOFF - IOFFEW=IOFFEW+IOFF - IOFF=IUVPAR*IUVLEV+KF_SCALARS_G - IF(LUVDER) THEN - IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - LLUV(IOFF+J)=.TRUE. - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - LLUV(IOFF+J+IUVLEV)=.TRUE. - ENDDO - IUVPAR=IUVPAR+2 - IOFF=IOFF+2*IUVLEV - IOFFEW=IOFFEW+2*IUVLEV - ENDIF ENDIF LLGP2(:)=.FALSE. - IF(LLPGP2) THEN + IF(PRESENT(PGP2)) THEN IOFF=IOFF1 IGP2PAR=UBOUND(PGP2,2) - IF(LSCDERS) IGP2PAR=IGP2PAR/3 DO J=1,IGP2PAR LLGP2(J+IOFF) = .TRUE. IGP2PARS(J+IOFF)=J ENDDO IOFF1=IOFF1+IGP2PAR - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+IGP2PAR - ENDDO - IOFFNS=IOFF+IGP2PAR - IOFF=IOFFEW - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+2*IGP2PAR - ENDDO - IOFFEW=IOFF+IGP2PAR - ENDIF ENDIF LLGP3A(:) = .FALSE. - IF(LLPGP3A) THEN + IF(PRESENT(PGP3A)) THEN IGP3ALEV=UBOUND(PGP3A,2) IGP3APAR=UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR=IGP3APAR/3 IOFF=IOFF1 DO J1=1,IGP3APAR DO J2=1,IGP3ALEV @@ -270,35 +204,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 ENDDO ENDDO - IPAROFF=IGP3APAR IOFF1=IOFF1+IGP3APAR*IGP3ALEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3APAR - IOFFNS=IOFFNS+IGP3APAR*IGP3ALEV - IOFF=IOFFEW - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV - ENDIF ENDIF LLGP3B(:) = .FALSE. - IF(LLPGP3B) THEN + IF(PRESENT(PGP3B)) THEN IGP3BLEV=UBOUND(PGP3B,2) IGP3BPAR=UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR=IGP3BPAR/3 IOFF=IOFF1 DO J1=1,IGP3BPAR DO J2=1,IGP3BLEV @@ -307,34 +218,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 ENDDO ENDDO - IPAROFF=IGP3BPAR IOFF1=IOFF1+IGP3BPAR*IGP3BLEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3BPAR - IOFFNS=IOFFNS+IGP3BPAR*IGP3BLEV - IOFF=IOFFEW - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV - ENDIF ENDIF CALL INIGPTR(IGPTRSEND,IGPTRRECV) - LLDONE = .FALSE. !$ACC DATA COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) ASYNC(1) @@ -464,7 +352,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO JFLD=1,KF_GP IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN IFLDS = IFLDS+1 - IF(LLINDER) THEN + IF(PRESENT(KPTRGP)) THEN IFLDOFF(IFLDS) = KPTRGP(JFLD) ELSE IFLDOFF(IFLDS) = JFLD @@ -487,7 +375,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) CALL GSTATS(1601,0) - IF(LLPGPONLY) THEN + IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,KF_FS @@ -575,9 +463,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = JKL+IFIRST-1 JI=(JJ-1)*IPOS+IJPOS(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IF(LLINDER) THEN + IF(PRESENT(KPTRGP)) THEN ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) - ELSEIF(LLPGPONLY) THEN + ELSEIF(PRESENT(PGP)) THEN ZCOMBUFS(JI,INS) = PGP(JK,IFLDT,JBLK) ELSEIF(LLUV(IFLDT)) THEN ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) @@ -609,11 +497,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS_BARRIER(761) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) - IF(.NOT.LGPNORM)THEN - CALL GSTATS(803,0) - ELSE - CALL GSTATS(804,0) - ENDIF IR=0 #ifdef COMVERBOSE @@ -660,11 +543,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! !IF(irank==0) WRITE(*,*) "non-CUDA-aware isend/irecv (trgtol) in sec: ", Tc !#endif - IF(.NOT.LGPNORM)THEN - CALL GSTATS(803,1) - ELSE - CALL GSTATS(804,1) - ENDIF CALL GSTATS_BARRIER2(761) !#ifdef COMVERBOSE diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index faa6b9182..cbe789df3 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -419,11 +419,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) - !$ACC KERNELS DEFAULT(NONE) - IF (IBUFLENS > 0) ZCOMBUFS(:,:) = 0 - IF (IBUFLENR > 0) ZCOMBUFR(:,:) = 0 - !$ACC END KERNELS - !$ACC DATA & !$ACC PRESENT(PGLAT) & !$ACC COPYIN(IGPTRSEND,INDOFF,KINDEX, LLUV,LLGP2,LLGP3A,LLGP3B,KPTRGP) @@ -476,11 +471,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL IF(LLINDER) THEN IFLD = KPTRGP(JFLD) - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,KINDEX(IPOS)) ELSE IFLD = IFLDOFF(JFLD) - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,KINDEX(IPOS)) ENDIF + PGP(JK,IFLD,JBLK) = PGLAT(JFLD,KINDEX(IPOS)) ENDIF ENDDO ENDDO From 8d63730464584f9ebb9006f0d85da8cc1bb14015 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:01 -0700 Subject: [PATCH 070/263] Simplify some index computations in TRGTOL --- src/trans/gpu/internal/trgtol_mod.F90 | 36 +++++++++++---------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index bc135bc5b..296887e14 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -137,7 +137,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) ! INTEGER FUNCTIONS INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL - INTEGER(KIND=JPIM) :: IJPOS(NGPBLKS),IFLDA(KF_GP),JJ,JI,IFLDT + INTEGER(KIND=JPIM) :: IFLDA(KF_GP),JJ,JI,IFLDT REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -360,17 +360,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - IPOS=0 - JK_MAX = 0 - DO JBLK=1,NGPBLKS - IGPTROFF(JBLK)=IPOS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IPOS=IPOS+ILAST-IFIRST+1 - IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1) - ENDIF + IGPTROFF(1)=0 + DO JBLK=2,NGPBLKS + IGPTROFF(JBLK)=IGPTROFF(JBLK)+IGPTRSEND(2,JBLK,MYSETW)-IGPTRSEND(1,JBLK,MYSETW)+1 ENDDO + JK_MAX = MAXVAL(IGPTRSEND(2,:,MYSETW)-IGPTRSEND(1,:,MYSETW))+1 !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) @@ -433,7 +427,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) IFLD = 0 - IPOS = 0 DO JFLD=1,KF_GP IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN IFLD = IFLD+1 @@ -441,19 +434,20 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - JK_MAX = 0 + IPOS = 0 DO JBLK=1,NGPBLKS IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - IJPOS(JBLK)=IPOS - IPOS = IPOS+ILAST-IFIRST+1 - IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1) - ENDIF + ILAST = IGPTRSEND(2,JBLK,ISETW) + IPOS = IPOS+ILAST-IFIRST+1 + ENDDO + IGPTROFF(1)=0 + DO JBLK=2,NGPBLKS + IGPTROFF(JBLK)=IGPTROFF(JBLK)+IGPTRSEND(2,JBLK,ISETW)-IGPTRSEND(1,JBLK,ISETW)+1 ENDDO + JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(IJPOS,IFLDA) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(IGPTROFF,IFLDA) ASYNC(1) DO JJ=1,ISEND_FLD_CNT DO JBLK=1,NGPBLKS DO JKL=1, JK_MAX @@ -461,7 +455,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 - JI=(JJ-1)*IPOS+IJPOS(JBLK)+JKL + JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN IF(PRESENT(KPTRGP)) THEN ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) From 56de6575a9f321738befe07b83995884e7024506 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:01 -0700 Subject: [PATCH 071/263] FIX: Fix index computation --- src/trans/gpu/internal/trgtol_mod.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 296887e14..93445993a 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -362,7 +362,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IGPTROFF(1)=0 DO JBLK=2,NGPBLKS - IGPTROFF(JBLK)=IGPTROFF(JBLK)+IGPTRSEND(2,JBLK,MYSETW)-IGPTRSEND(1,JBLK,MYSETW)+1 + IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,MYSETW)-IGPTRSEND(1,JBLK-1,MYSETW)+1 ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,MYSETW)-IGPTRSEND(1,:,MYSETW))+1 @@ -442,7 +442,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO IGPTROFF(1)=0 DO JBLK=2,NGPBLKS - IGPTROFF(JBLK)=IGPTROFF(JBLK)+IGPTRSEND(2,JBLK,ISETW)-IGPTRSEND(1,JBLK,ISETW)+1 + IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 From 5592d715f206399826790858f75884d1e1db94f9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:02 -0700 Subject: [PATCH 072/263] Some minor cleanup in TRGTOL --- src/trans/gpu/internal/trgtol_mod.F90 | 162 ++++++++++++-------------- 1 file changed, 74 insertions(+), 88 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 93445993a..0c43fea39 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -117,7 +117,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& &ILASTLAT, ILEN, JROC, IPOS, ISETA, & &ISETB, IRECV, & - &ISETV, ISEND, ITAG, J, JBLK, JFLD, & + &ISETV, ISEND, J, JBLK, JFLD, & &JGL, JK, JL, JLOOP, ISETW, IFLD, & &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX @@ -220,25 +220,15 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO IOFF1=IOFF1+IGP3BPAR*IGP3BLEV ENDIF - - + CALL INIGPTR(IGPTRSEND,IGPTRRECV) - - !$ACC DATA COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) ASYNC(1) - - ITAG = MTAGGL - - INDOFFX = 0 + IBUFLENS = 0 - IBUFLENR = 0 - INRECV = 0 INSEND = 0 - + ! Prepare sender arrays DO JROC=1,NPROC - - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - ISEND = JROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! count up expected number of fields ! if fields are not distributed over processes, KVSET=-=1, so IPOS=KF_GP, @@ -253,7 +243,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ISEND_FLD_TOTAL(JROC) = IPOS ! the W-set is horizontal distribution only - how much data are we going to send to ! that process? IGPTRRECV tells me about one layer. - ISENDTOT(JROC) = IGPTRRECV(ISETW)*IPOS + ISENDTOT(JROC) = IGPTRRECV(ISETW)*ISEND_FLD_TOTAL(JROC) IF( JROC /= MYPROC) THEN ! IBUFLENS sums up all but my process @@ -264,8 +254,15 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JSEND(INSEND)=JROC ENDIF ENDIF + ENDDO - ! THIS IS RECEIVER SIDE: + INRECV = 0 + INDOFFX = 0 + IBUFLENR = 0 + ! Prepare receiver arrays + DO JROC=1,NPROC + + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver @@ -308,42 +305,26 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) ! get from "actual" latitude to the latitude offset IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),& - &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1 + DO JL=1,D%NONL(IGL,ISETB) IPOS = IPOS+1 ! indicates where the data has to be stored - KINDEX(INDOFF(JROC)+IPOS) = JL + KINDEX(INDOFF(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 ENDDO ENDDO ENDIF - ENDDO - - ISENDCOUNT=MAXVAL(ISENDTOT) - IRECVCOUNT=MAXVAL(IRECVTOT) - - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(IRECVCOUNT,INRECV)) - !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) - !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) - - !$ACC KERNELS DEFAULT(NONE) ASYNC(1) - IF (IBUFLENS > 0) ZCOMBUFS(:,:) = 0. - IF (IBUFLENR > 0) ZCOMBUFR(:,:) = 0. - !$ACC END KERNELS CALL GSTATS(1805,1) - ! Send loop............................................................. - - ! Copy local contribution - !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) ASYNC(1) + !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) & + !$ACC& COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) COPYIN(IUVLEVS,IUVPARS) ASYNC(1) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) COPYIN(IGP2PARS) ASYNC(1) !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) COPYIN(IGP3ALEVS,IGP3APARS) ASYNC(1) !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) ASYNC(1) - + + ! Copy local contribution IF(ISENDTOT(MYPROC) > 0 )THEN ! I have to send something to myself... @@ -366,59 +347,66 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,MYSETW)-IGPTRSEND(1,:,MYSETW))+1 - !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) + !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) - CALL GSTATS(1601,0) - IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) - DO JBLK=1,NGPBLKS - DO JFLD=1,KF_FS - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDOFF(JFLD),JBLK) - ENDIF + CALL GSTATS(1601,0) + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) + DO JBLK=1,NGPBLKS + DO JFLD=1,KF_FS + DO JKL=1, JK_MAX + IFIRST = IGPTRSEND(1,JBLK,MYSETW) + ILAST = IGPTRSEND(2,JBLK,MYSETW) + JK = JKL+IFIRST-1 + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL + PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDOFF(JFLD),JBLK) + ENDIF + ENDDO ENDDO ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) - DO JBLK=1,NGPBLKS - DO JFLD=1,KF_FS - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - IFLD = IFLDOFF(JFLD) - IF(LLUV(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) - ELSEIF(LLGP2(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK) - ELSEIF(LLGP3A(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) - ELSEIF(LLGP3B(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) + ELSE + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) + DO JBLK=1,NGPBLKS + DO JFLD=1,KF_FS + DO JKL=1, JK_MAX + IFIRST = IGPTRSEND(1,JBLK,MYSETW) + ILAST = IGPTRSEND(2,JBLK,MYSETW) + JK = JKL+IFIRST-1 + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL + IFLD = IFLDOFF(JFLD) + IF(LLUV(IFLD)) THEN + PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) + ELSEIF(LLGP2(IFLD)) THEN + PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK) + ELSEIF(LLGP3A(IFLD)) THEN + PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) + ELSEIF(LLGP3B(IFLD)) THEN + PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) + ENDIF ENDIF - ENDIF + ENDDO ENDDO ENDDO - ENDDO - ENDIF - CALL GSTATS(1601,1) + ENDIF + CALL GSTATS(1601,1) - !$ACC END DATA + !$ACC END DATA ENDIF - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif + ! Send loop............................................................. + + + ISENDCOUNT=MAXVAL(ISENDTOT) + IRECVCOUNT=MAXVAL(IRECVTOT) + IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) + IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(IRECVCOUNT,INRECV)) + + !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) + !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) + !....Pack loop......................................................... CALL GSTATS(1602,0) @@ -510,7 +498,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IR=IR+1 IRECV=JRECV(INR) CALL MPI_IRECV(ZCOMBUFR(1:IRECVTOT(IRECV),INR),IRECVTOT(IRECV), & - & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !....Send loop......................................................... @@ -518,7 +506,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IR=IR+1 ISEND=JSEND(INS) CALL MPI_ISEND(ZCOMBUFS(1:ISENDTOT(ISEND),INS),ISENDTOT(ISEND), & - & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,ITAG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA @@ -571,15 +559,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1603,1) + !$ACC END DATA !! ZCOMBUFS + !$ACC END DATA !! ZCOMBUFS !$ACC END DATA !! PRESENT(PGP3B) !$ACC END DATA !! PRESENT(PGP3A) !$ACC END DATA !! PRESENT(PGP2) !$ACC END DATA !! PRESENT(PGPUV) !$ACC END DATA !! PRESENT(PGP) - !$ACC END DATA !! PRESENT(PGLAT) - - !$ACC END DATA !! ZCOMBUFS - !$ACC END DATA !! ZCOMBUFS IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) From 93eb16a5233c88c29987e77db7f56b6f15c81829 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:02 -0700 Subject: [PATCH 073/263] Split packing loop similar to self transpose --- src/trans/gpu/internal/trgtol_mod.F90 | 72 +++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 0c43fea39..626272e6b 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -433,35 +433,55 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 - - - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(IGPTROFF,IFLDA) ASYNC(1) - DO JJ=1,ISEND_FLD_CNT - DO JBLK=1,NGPBLKS - DO JKL=1, JK_MAX - IFLDT=IFLDA(JJ) - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IF(PRESENT(KPTRGP)) THEN - ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) - ELSEIF(PRESENT(PGP)) THEN - ZCOMBUFS(JI,INS) = PGP(JK,IFLDT,JBLK) - ELSEIF(LLUV(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) - ELSEIF(LLGP2(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP2(JK,IGP2PARS(IFLDT),JBLK) - ELSEIF(LLGP3A(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) - ELSEIF(LLGP3B(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) + + !$ACC DATA COPYIN(IGPTROFF,IFLDA) ASYNC(1) + + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) + DO JJ=1,ISEND_FLD_CNT + DO JBLK=1,NGPBLKS + DO JKL=1, JK_MAX + IFLDT=IFLDA(JJ) + IFIRST = IGPTRSEND(1,JBLK,ISETW) + ILAST = IGPTRSEND(2,JBLK,ISETW) + JK = JKL+IFIRST-1 + JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IF(PRESENT(KPTRGP)) THEN + ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) + ELSE + ZCOMBUFS(JI,INS) = PGP(JK,IFLDT,JBLK) + ENDIF ENDIF - ENDIF + ENDDO ENDDO ENDDO - ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) + DO JJ=1,ISEND_FLD_CNT + DO JBLK=1,NGPBLKS + DO JKL=1, JK_MAX + IFLDT=IFLDA(JJ) + IFIRST = IGPTRSEND(1,JBLK,ISETW) + ILAST = IGPTRSEND(2,JBLK,ISETW) + JK = JKL+IFIRST-1 + JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IF(LLUV(IFLDT)) THEN + ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) + ELSEIF(LLGP2(IFLDT)) THEN + ZCOMBUFS(JI,INS) = PGP2(JK,IGP2PARS(IFLDT),JBLK) + ELSEIF(LLGP3A(IFLDT)) THEN + ZCOMBUFS(JI,INS) = PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) + ELSEIF(LLGP3B(IFLDT)) THEN + ZCOMBUFS(JI,INS) = PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) + ENDIF + ENDIF + ENDDO + ENDDO + ENDDO + ENDIF + !$ACC END DATA ENDDO From cbde3476b9ec1b50c149884cbed0084547704202 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:02 -0700 Subject: [PATCH 074/263] Strucutre pack and self-send similarly --- src/trans/gpu/internal/trgtol_mod.F90 | 104 +++++++++++++------------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 626272e6b..fce878386 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -129,7 +129,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR INTEGER(KIND=JPIM) :: IOFF,IOFF1,J1,J2 - INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(KF_FS) + INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT INTEGER(KIND=JPIM) :: INUMFLDS INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) @@ -223,8 +223,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL INIGPTR(IGPTRSEND,IGPTRRECV) - IBUFLENS = 0 - INSEND = 0 ! Prepare sender arrays DO JROC=1,NPROC @@ -244,19 +242,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! the W-set is horizontal distribution only - how much data are we going to send to ! that process? IGPTRRECV tells me about one layer. ISENDTOT(JROC) = IGPTRRECV(ISETW)*ISEND_FLD_TOTAL(JROC) - - IF( JROC /= MYPROC) THEN - ! IBUFLENS sums up all but my process - IBUFLENS = MAX(IBUFLENS,ISENDTOT(JROC)) - IF(ISENDTOT(JROC) > 0) THEN - ! I have to send something, so let me store that - INSEND = INSEND+1 - JSEND(INSEND)=JROC - ENDIF - ENDIF ENDDO - INRECV = 0 INDOFFX = 0 IBUFLENR = 0 ! Prepare receiver arrays @@ -281,16 +268,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! We always receive the full fourier space IRECVTOT(JROC) = IPOS*KF_FS - IF( JROC /= MYPROC) THEN - ! IBUFLENR sums up all but my process - IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) - IF(IRECVTOT(JROC) > 0) THEN - ! I have to recv something, so let me store that - INRECV = INRECV + 1 - JRECV(INRECV)=JROC - ENDIF - ENDIF - IF(IRECVTOT(JROC) > 0) THEN ! If I have to recv something, we need to fill KINDEX, this is the unpacking instruction... @@ -334,9 +311,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN IFLDS = IFLDS+1 IF(PRESENT(KPTRGP)) THEN - IFLDOFF(IFLDS) = KPTRGP(JFLD) + IFLDA(IFLDS) = KPTRGP(JFLD) ELSE - IFLDOFF(IFLDS) = JFLD + IFLDA(IFLDS) = JFLD ENDIF ENDIF ENDDO @@ -347,7 +324,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,MYSETW)-IGPTRSEND(1,:,MYSETW))+1 - !$ACC DATA COPYIN(IFLDOFF,IGPTROFF) ASYNC(1) + !$ACC DATA COPYIN(IFLDA(1:IFLDS),IGPTROFF) ASYNC(1) CALL GSTATS(1601,0) IF(PRESENT(PGP)) THEN @@ -360,7 +337,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = JKL+IFIRST-1 IF(IFIRST > 0 .AND. JK <= ILAST) THEN IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDOFF(JFLD),JBLK) + PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) ENDIF ENDDO ENDDO @@ -375,7 +352,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = JKL+IFIRST-1 IF(IFIRST > 0 .AND. JK <= ILAST) THEN IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - IFLD = IFLDOFF(JFLD) + IFLD = IFLDA(JFLD) IF(LLUV(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) ELSEIF(LLGP2(IFLD)) THEN @@ -389,6 +366,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ENDDO + + ! send to self is done, so set those parts to 0 + ISENDTOT(MYPROC) = 0 + IRECVTOT(MYPROC) = 0 ENDIF CALL GSTATS(1601,1) @@ -396,6 +377,25 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF + IBUFLENR = MAXVAL(IRECVTOT) + IBUFLENS = MAXVAL(ISENDTOT) + ! Figure out processes that send or recv something + INSEND = 0 + INRECV = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + INRECV = INRECV + 1 + JRECV(INRECV)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + INSEND = INSEND+1 + JSEND(INSEND)=JROC + ENDIF + ENDIF + ENDDO ! Send loop............................................................. @@ -413,59 +413,57 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INS=1,INSEND ISEND=JSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) + ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) - IFLD = 0 + + IFLDS = 0 DO JFLD=1,KF_GP IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN - IFLD = IFLD+1 - IFLDA(IFLD)=JFLD + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD + ENDIF ENDIF ENDDO - - IPOS = 0 - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - IPOS = IPOS+ILAST-IFIRST+1 - ENDDO + IGPTROFF(1)=0 DO JBLK=2,NGPBLKS IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 ENDDO JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 - !$ACC DATA COPYIN(IGPTROFF,IFLDA) ASYNC(1) + ! Total send size + IPOS = IGPTRRECV(ISETW) + + !$ACC DATA COPYIN(IGPTROFF,IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) - DO JJ=1,ISEND_FLD_CNT - DO JBLK=1,NGPBLKS + DO JBLK=1,NGPBLKS + DO JFLD=1,ISEND_FLD_CNT DO JKL=1, JK_MAX - IFLDT=IFLDA(JJ) IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 - JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IF(PRESENT(KPTRGP)) THEN - ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) - ELSE - ZCOMBUFS(JI,INS) = PGP(JK,IFLDT,JBLK) - ENDIF + ZCOMBUFS(JI,INS) = PGP(JK,IFLDA(JFLD),JBLK) ENDIF ENDDO ENDDO ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) - DO JJ=1,ISEND_FLD_CNT - DO JBLK=1,NGPBLKS + DO JBLK=1,NGPBLKS + DO JFLD=1,ISEND_FLD_CNT DO JKL=1, JK_MAX - IFLDT=IFLDA(JJ) + IFLDT=IFLDA(JFLD) IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 - JI=(JJ-1)*IPOS+IGPTROFF(JBLK)+JKL + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN IF(LLUV(IFLDT)) THEN ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) From cd0564120675b73d4f919f579a002aaf5eb76314 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:02 -0700 Subject: [PATCH 075/263] Simplify GP_XXX indexing --- src/trans/gpu/internal/trgtol_mod.F90 | 152 +++++++++++--------------- 1 file changed, 61 insertions(+), 91 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index fce878386..c7ac67199 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -120,15 +120,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETV, ISEND, J, JBLK, JFLD, & &JGL, JK, JL, JLOOP, ISETW, IFLD, & &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX + &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX, PBOUND ! LOCAL LOGICAL SCALARS - LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) LOGICAL :: LLINDER - INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) - INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) - INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR - INTEGER(KIND=JPIM) :: IOFF,IOFF1,J1,J2 + INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT INTEGER(KIND=JPIM) :: INUMFLDS @@ -137,7 +133,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) ! INTEGER FUNCTIONS INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL - INTEGER(KIND=JPIM) :: IFLDA(KF_GP),JJ,JI,IFLDT + INTEGER(KIND=JPIM) :: IFLDA(KF_GP),JJ,JI REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -145,6 +141,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRBT) :: TIMEF, tc + INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3B = 4 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 + INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) + #ifdef PARKINDTRANS_SINGLE #define TRGTOL_DTYPE MPI_REAL #else @@ -161,65 +164,16 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& iunit=300+myproc CALL GSTATS(1805,0) - - IUVPAR=0 - IUVLEV=0 - IOFF1=0 - LLUV(:) = .FALSE. - IUVPARS(:) = -99 - IUVLEVS(:) = -99 - IF (PRESENT(PGPUV)) THEN - IOFF=0 - IUVLEV=UBOUND(PGPUV,2) - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - ENDDO - IUVPAR=IUVPAR+2 - LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE. - IOFF=IOFF+2*IUVLEV - IOFF1=IOFF - ENDIF - LLGP2(:)=.FALSE. - IF(PRESENT(PGP2)) THEN - IOFF=IOFF1 - IGP2PAR=UBOUND(PGP2,2) - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J - ENDDO - IOFF1=IOFF1+IGP2PAR - ENDIF - LLGP3A(:) = .FALSE. - IF(PRESENT(PGP3A)) THEN - IGP3ALEV=UBOUND(PGP3A,2) - IGP3APAR=UBOUND(PGP3A,3) - IOFF=IOFF1 - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1 - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IOFF1=IOFF1+IGP3APAR*IGP3ALEV - ENDIF - LLGP3B(:) = .FALSE. - IF(PRESENT(PGP3B)) THEN - IGP3BLEV=UBOUND(PGP3B,2) - IGP3BPAR=UBOUND(PGP3B,3) - IOFF=IOFF1 - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1 - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IOFF1=IOFF1+IGP3BPAR*IGP3BLEV - ENDIF + IOFF=1 + PGP_INDICES(PGP_INDICES_UV) = IOFF + IF (PRESENT(PGPUV)) IOFF=IOFF+UBOUND(PGPUV,2)*2 + PGP_INDICES(PGP_INDICES_GP2) = IOFF + IF (PRESENT(PGP2)) IOFF=IOFF+UBOUND(PGP2,2) + PGP_INDICES(PGP_INDICES_GP3A) = IOFF + IF (PRESENT(PGP3A)) IOFF=IOFF+UBOUND(PGP3A,2)*UBOUND(PGP3A,3) + PGP_INDICES(PGP_INDICES_GP3B) = IOFF + IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3) + PGP_INDICES(PGP_INDICES_END) = IOFF CALL INIGPTR(IGPTRSEND,IGPTRRECV) @@ -294,12 +248,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1805,1) !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) & - !$ACC& COPYIN(LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) ASYNC(1) + !$ACC& COPYIN(IGPTRSEND,PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) COPYIN(IUVLEVS,IUVPARS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) COPYIN(IGP2PARS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) COPYIN(IGP3ALEVS,IGP3APARS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) COPYIN(IGP3BLEVS,IGP3BPARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) ! Copy local contribution IF(ISENDTOT(MYPROC) > 0 )THEN @@ -343,7 +297,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK,IOFF,PBOUND) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,KF_FS DO JKL=1, JK_MAX @@ -353,14 +307,22 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF(IFIRST > 0 .AND. JK <= ILAST) THEN IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL IFLD = IFLDA(JFLD) - IF(LLUV(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) - ELSEIF(LLGP2(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK) - ELSEIF(LLGP3A(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) - ELSEIF(LLGP3B(IFLD)) THEN - PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDIF ENDDO @@ -440,7 +402,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA COPYIN(IGPTROFF,IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFIRST,ILAST) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,ISEND_FLD_CNT DO JKL=1, JK_MAX @@ -455,24 +417,32 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLD,IFIRST,ILAST) ASYNC(1) DO JBLK=1,NGPBLKS DO JFLD=1,ISEND_FLD_CNT DO JKL=1, JK_MAX - IFLDT=IFLDA(JFLD) + IFLD=IFLDA(JFLD) IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IF(LLUV(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) - ELSEIF(LLGP2(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP2(JK,IGP2PARS(IFLDT),JBLK) - ELSEIF(LLGP3A(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) - ELSEIF(LLGP3B(IFLDT)) THEN - ZCOMBUFS(JI,INS) = PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + ZCOMBUFS(JI,INS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + ZCOMBUFS(JI,INS) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + ZCOMBUFS(JI,INS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + ZCOMBUFS(JI,INS)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDIF ENDDO From 4fe9543976c1b751c48771b74df538df9c4d354a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:03 -0700 Subject: [PATCH 076/263] Minor non-critical cleanup in TRGTOL --- src/trans/gpu/internal/trgtol_mod.F90 | 292 +++++++++++--------------- 1 file changed, 128 insertions(+), 164 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index c7ac67199..6832363ea 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -12,44 +12,44 @@ MODULE TRGTOL_MOD CONTAINS SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) - + !**** *TRGTOL * - transposition of grid point data from column ! structure to latitudinal. Reorganize data between ! grid point calculations and direct Fourier Transform - + ! Version using CUDA-aware MPI ! Purpose. ! -------- - - + + !** Interface. ! ---------- ! *call* *trgtol(...) - + ! Explicit arguments : ! -------------------- ! PGLAT - Latitudinal data ready for direct FFT (output) ! PGP - Blocked grid point data (input) - + ! Implicit arguments : ! -------------------- - + ! Method. ! ------- ! See documentation - + ! Externals. ! ---------- - + ! Reference. ! ---------- ! ECMWF Research Department documentation of the IFS - + ! Author. ! ------- ! MPP Group *ECMWF* - + ! Modifications. ! -------------- ! Original: 95-10-01 @@ -68,19 +68,19 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! 08-01-01 G.Mozdzynski: cleanup ! 09-01-02 G.Mozdzynski: use non-blocking recv and send ! ------------------------------------------------------------------ - - - + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - + USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_BARRIER - + USE TPM_GEN ,ONLY : NOUT, LSYNC_TRANS USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL, & & MYSETV, MYSETW, MYPROC, NPROC USE TPM_TRANS ,ONLY : NGPBLKS - + USE INIGPTR_MOD ,ONLY : INIGPTR USE PE2SET_MOD ,ONLY : PE2SET !USE MYSENDSET_MOD @@ -90,9 +90,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS ! USE MPI - + IMPLICIT NONE - + REAL(KIND=JPRBT),INTENT(OUT) :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP @@ -103,44 +103,37 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) - + REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) - REAL(KIND=JPRBT) :: ZDUM(2) - + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: JRECV (NPROC) INTEGER(KIND=JPIM) :: JSEND (NPROC) - + ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& &ILASTLAT, ILEN, JROC, IPOS, ISETA, & &ISETB, IRECV, & - &ISETV, ISEND, J, JBLK, JFLD, & - &JGL, JK, JL, JLOOP, ISETW, IFLD, & + &ISETV, ISEND, JBLK, JFLD, & + &JGL, JI, JK, JL, ISETW, IFLD, & &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, IUNIT, JKL, JK_MAX, PBOUND - + &INSEND,INS,INR,IR, JKL, JK_MAX, PBOUND, IERROR + ! LOCAL LOGICAL SCALARS - LOGICAL :: LLINDER INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT - INTEGER(KIND=JPIM) :: INUMFLDS INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) ! INTEGER FUNCTIONS INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL - INTEGER(KIND=JPIM) :: IFLDA(KF_GP),JJ,JI + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - - INTEGER(KIND=JPIM) :: IERROR, irank - - REAL(KIND=JPRBT) :: TIMEF, tc - + INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3 @@ -155,13 +148,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& #endif ! ------------------------------------------------------------------ - + !* 0. Some initializations ! -------------------- - + IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',0,ZHOOK_HANDLE) - - iunit=300+myproc CALL GSTATS(1805,0) IOFF=1 @@ -246,7 +237,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO CALL GSTATS(1805,1) - + !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) & !$ACC& COPYIN(IGPTRSEND,PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) @@ -255,6 +246,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) + ! TODO We should do the local contribution *WHILE* sending the data... ! Copy local contribution IF(ISENDTOT(MYPROC) > 0 )THEN ! I have to send something to myself... @@ -271,7 +263,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDIF ENDDO - + IGPTROFF(1)=0 DO JBLK=2,NGPBLKS IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,MYSETW)-IGPTRSEND(1,JBLK-1,MYSETW)+1 @@ -360,7 +352,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ! Send loop............................................................. - + ISENDCOUNT=MAXVAL(ISENDTOT) IRECVCOUNT=MAXVAL(IRECVTOT) IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) @@ -368,111 +360,98 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) - + !....Pack loop......................................................... - + CALL GSTATS(1602,0) - DO INS=1,INSEND - ISEND=JSEND(INS) - CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) + DO INS=1,INSEND + ISEND=JSEND(INS) + CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) + ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) - IFLDS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS)=KPTRGP(JFLD) - ELSE - IFLDA(IFLDS)=JFLD - ENDIF + IFLDS = 0 + DO JFLD=1,KF_GP + IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD ENDIF - ENDDO + ENDIF + ENDDO - IGPTROFF(1)=0 - DO JBLK=2,NGPBLKS - IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 - ENDDO - JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 + IGPTROFF(1)=0 + DO JBLK=2,NGPBLKS + IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 + ENDDO + JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 - ! Total send size - IPOS = IGPTRRECV(ISETW) + ! Total send size + IPOS = IGPTRRECV(ISETW) - !$ACC DATA COPYIN(IGPTROFF,IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) + !$ACC DATA COPYIN(IGPTROFF,IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) - IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFIRST,ILAST) ASYNC(1) - DO JBLK=1,NGPBLKS - DO JFLD=1,ISEND_FLD_CNT - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL - IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFIRST,ILAST) ASYNC(1) + DO JBLK=1,NGPBLKS + DO JFLD=1,ISEND_FLD_CNT + DO JKL=1, JK_MAX + IFIRST = IGPTRSEND(1,JBLK,ISETW) + ILAST = IGPTRSEND(2,JBLK,ISETW) + JK = JKL+IFIRST-1 + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL + IF(IFIRST > 0 .AND. JK <= ILAST) THEN ZCOMBUFS(JI,INS) = PGP(JK,IFLDA(JFLD),JBLK) - ENDIF - ENDDO + ENDIF ENDDO ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLD,IFIRST,ILAST) ASYNC(1) - DO JBLK=1,NGPBLKS - DO JFLD=1,ISEND_FLD_CNT - DO JKL=1, JK_MAX - IFLD=IFLDA(JFLD) - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(JI,INS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(JI,INS) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(JI,INS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(JI,INS)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ENDIF + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLD,IFIRST,ILAST) ASYNC(1) + DO JBLK=1,NGPBLKS + DO JFLD=1,ISEND_FLD_CNT + DO JKL=1, JK_MAX + IFLD=IFLDA(JFLD) + IFIRST = IGPTRSEND(1,JBLK,ISETW) + ILAST = IGPTRSEND(2,JBLK,ISETW) + JK = JKL+IFIRST-1 + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + ZCOMBUFS(JI,INS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + ZCOMBUFS(JI,INS) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + ZCOMBUFS(JI,INS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + ZCOMBUFS(JI,INS)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF - ENDDO + ENDIF ENDDO ENDDO - ENDIF - !$ACC END DATA - ENDDO + ENDDO + ENDIF + !$ACC END DATA + ENDDO - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - CALL MPI_COMM_RANK(MPI_COMM_WORLD, IRANK, IERROR) - !IF(irank==0) WRITE(*,*) "packing (trgtol) in sec: ", Tc - #endif - CALL GSTATS(1602,1) - + !$ACC WAIT(1) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',0,ZHOOK_HANDLE_BAR) CALL GSTATS_BARRIER(761) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) - - IR=0 - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif + IF (LSYNC_TRANS) THEN CALL GSTATS(423,0) CALL MPL_BARRIER(CDSTRING='TRGTOL BARRIER') @@ -480,6 +459,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF CALL GSTATS(413,0) + IR=0 + !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) ! Receive loop......................................................... DO INR=1,INRECV @@ -488,7 +469,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL MPI_IRECV(ZCOMBUFR(1:IRECVTOT(IRECV),INR),IRECVTOT(IRECV), & & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO - + !....Send loop......................................................... DO INS=1,INSEND IR=IR+1 @@ -497,7 +478,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA - + IF(IR > 0) THEN CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & & CDSTRING='TRGTOL_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') @@ -506,60 +487,43 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL MPL_BARRIER(CDSTRING='TRGTOL BARRIER') ENDIF CALL GSTATS(413,1) - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=(TIMEF()-Tc)/1000.0_JPRBT - ! !IF(irank==0) WRITE(*,*) "non-CUDA-aware isend/irecv (trgtol) in sec: ", Tc - !#endif - + CALL GSTATS_BARRIER2(761) - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=TIMEF() - !#endif ! Unpack loop......................................................... - + CALL GSTATS(1603,0) DO INR=1,INRECV - IRECV=JRECV(INR) - ILEN = IRECVTOT(IRECV)/KF_FS - INDOFFL = INDOFF(IRECV) - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ILEN - II = KINDEX(INDOFFL+JL) - PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-1)*ILEN,INR) - ENDDO + IRECV=JRECV(INR) + ILEN = IRECVTOT(IRECV)/KF_FS + INDOFFL = INDOFF(IRECV) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ILEN + II = KINDEX(INDOFFL+JL) + PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-1)*ILEN,INR) ENDDO + ENDDO ENDDO !$ACC WAIT(1) !$ACC END DATA - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=(TIMEF()-Tc)/1000.0_JPRBT - ! !IF(irank==0) WRITE(*,*) "unpacking (trgtol) in sec: ", Tc - !#endif CALL GSTATS(1603,1) - !$ACC END DATA !! ZCOMBUFS - !$ACC END DATA !! ZCOMBUFS - !$ACC END DATA !! PRESENT(PGP3B) - !$ACC END DATA !! PRESENT(PGP3A) - !$ACC END DATA !! PRESENT(PGP2) - !$ACC END DATA !! PRESENT(PGPUV) - !$ACC END DATA !! PRESENT(PGP) - + !$ACC END DATA !! ZCOMBUFS + !$ACC END DATA !! ZCOMBUFS + !$ACC END DATA !! PRESENT(PGP3B) + !$ACC END DATA !! PRESENT(PGP3A) + !$ACC END DATA !! PRESENT(PGP2) + !$ACC END DATA !! PRESENT(PGPUV) + !$ACC END DATA !! PRESENT(PGP) + IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) - + END SUBROUTINE TRGTOL_CUDAAWARE SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& From 8c93b89ea98c7497caf5c94450d9e0a05edf17b0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:03 -0700 Subject: [PATCH 077/263] Make ZCOMBUFS/ZCOMBUFR properly sized --- src/trans/gpu/internal/trgtol_mod.F90 | 43 ++++++++++++++++++--------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 6832363ea..ea78f72e1 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -104,7 +104,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) + REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:),ZCOMBUFR(:) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) @@ -128,6 +128,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: SEND_OFFSET, RECV_OFFSET ! INTEGER FUNCTIONS INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL INTEGER(KIND=JPIM) :: IFLDA(KF_GP) @@ -350,13 +352,24 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDIF ENDDO - ! Send loop............................................................. + ALLOCATE(ICOMBUFS_OFFSET(INSEND+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,INSEND + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(JSEND(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(INRECV+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,INRECV + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(JRECV(JROC)) + ENDDO + + ! Send loop............................................................. ISENDCOUNT=MAXVAL(ISENDTOT) IRECVCOUNT=MAXVAL(IRECVTOT) - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ISENDCOUNT,INSEND)) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(IRECVCOUNT,INRECV)) + IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(INSEND+1))) + IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(INRECV+1))) !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) @@ -369,6 +382,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) + SEND_OFFSET = ICOMBUFS_OFFSET(INS) IFLDS = 0 DO JFLD=1,KF_GP @@ -401,9 +415,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN - ZCOMBUFS(JI,INS) = PGP(JK,IFLDA(JFLD),JBLK) + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL + ZCOMBUFS(SEND_OFFSET+JI) = PGP(JK,IFLDA(JFLD),JBLK) ENDIF ENDDO ENDDO @@ -417,24 +431,24 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFIRST = IGPTRSEND(1,JBLK,ISETW) ILAST = IGPTRSEND(2,JBLK,ISETW) JK = JKL+IFIRST-1 - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFIRST > 0 .AND. JK <= ILAST) THEN + JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(JI,INS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(SEND_OFFSET+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(JI,INS) = PGP2(JK,IOFF+1,JBLK) + ZCOMBUFS(SEND_OFFSET+JI) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(JI,INS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(SEND_OFFSET+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(JI,INS)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(SEND_OFFSET+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDIF ENDDO @@ -466,7 +480,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INR=1,INRECV IR=IR+1 IRECV=JRECV(INR) - CALL MPI_IRECV(ZCOMBUFR(1:IRECVTOT(IRECV),INR),IRECVTOT(IRECV), & + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IRECV), & & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO @@ -474,7 +488,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INS=1,INSEND IR=IR+1 ISEND=JSEND(INS) - CALL MPI_ISEND(ZCOMBUFS(1:ISENDTOT(ISEND),INS),ISENDTOT(ISEND), & + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA @@ -498,11 +512,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS INDOFFL = INDOFF(IRECV) + RECV_OFFSET = ICOMBUFR_OFFSET(INR) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN II = KINDEX(INDOFFL+JL) - PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-1)*ILEN,INR) + PGLAT(JFLD,II) = ZCOMBUFR(RECV_OFFSET+JL+(JFLD-1)*ILEN) ENDDO ENDDO ENDDO From d9c28ace152309718ee6125d6e4c77f67fdffc08 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:03 -0700 Subject: [PATCH 078/263] Not critical: tiny cleanup --- src/trans/gpu/internal/trgtol_mod.F90 | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index ea78f72e1..c24dddd59 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -118,7 +118,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & - &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & + &II,INDOFFX,IBUFLENR,INRECV, IPROC,IFLDS, & &INSEND,INS,INR,IR, JKL, JK_MAX, PBOUND, IERROR ! LOCAL LOGICAL SCALARS @@ -131,7 +131,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) INTEGER(KIND=JPIM) :: SEND_OFFSET, RECV_OFFSET ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT,INDOFFL + INTEGER(KIND=JPIM) :: INDOFFL INTEGER(KIND=JPIM) :: IFLDA(KF_GP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -192,7 +192,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO INDOFFX = 0 - IBUFLENR = 0 ! Prepare receiver arrays DO JROC=1,NPROC @@ -322,10 +321,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ENDDO - - ! send to self is done, so set those parts to 0 - ISENDTOT(MYPROC) = 0 - IRECVTOT(MYPROC) = 0 ENDIF CALL GSTATS(1601,1) @@ -333,8 +328,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF - IBUFLENR = MAXVAL(IRECVTOT) - IBUFLENS = MAXVAL(ISENDTOT) ! Figure out processes that send or recv something INSEND = 0 INRECV = 0 @@ -366,13 +359,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! Send loop............................................................. - ISENDCOUNT=MAXVAL(ISENDTOT) - IRECVCOUNT=MAXVAL(IRECVTOT) - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(INSEND+1))) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(INRECV+1))) + IF (INSEND > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(INSEND+1))) + IF (INRECV > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(INRECV+1))) - !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) ASYNC(1) - !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) ASYNC(1) + !$ACC DATA IF(INSEND > 0) CREATE(ZCOMBUFS) ASYNC(1) + !$ACC DATA IF(INRECV > 0) CREATE(ZCOMBUFR) ASYNC(1) !....Pack loop......................................................... @@ -534,8 +525,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC END DATA !! PRESENT(PGPUV) !$ACC END DATA !! PRESENT(PGP) - IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) - IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) + IF (INSEND > 0) DEALLOCATE(ZCOMBUFS) + IF (INRECV > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) From faea5039aabc85af0640f722623fa53c472a11cd Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:03 -0700 Subject: [PATCH 079/263] Simplify filling of receiver side in trgtol --- src/trans/gpu/internal/trgtol_mod.F90 | 225 ++++++++++++-------------- 1 file changed, 100 insertions(+), 125 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index c24dddd59..8f6593d39 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -77,11 +77,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_BARRIER USE TPM_GEN ,ONLY : NOUT, LSYNC_TRANS - USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL, & + USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL,NPRTRW,NPRTRV, & & MYSETV, MYSETW, MYPROC, NPROC - USE TPM_TRANS ,ONLY : NGPBLKS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_TRANS ,ONLY : NGPBLKS, NPROMA - USE INIGPTR_MOD ,ONLY : INIGPTR USE PE2SET_MOD ,ONLY : PE2SET !USE MYSENDSET_MOD USE MPL_DATA_MODULE, only: MPL_COMM_OML @@ -119,17 +119,15 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & &II,INDOFFX,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, JKL, JK_MAX, PBOUND, IERROR + &INSEND,INS,INR,IR, JKL, PBOUND, IERROR ! LOCAL LOGICAL SCALARS INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC) - INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_CNT - INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) + INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPRTRV),ISEND_FLD_CNT + INTEGER(KIND=JPIM) :: IPOINTS_PER_WSET(NPRTRW), IWSET_POINT_OFFSET(NPRTRW+1), ILOCAL_LAT, ILAT_STRIP INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: SEND_OFFSET, RECV_OFFSET + INTEGER(KIND=JPIM) :: SEND_OFFSET, RECV_OFFSET, IWPOINTS_COUNT, IWPOINTS_OFFSET ! INTEGER FUNCTIONS INTEGER(KIND=JPIM) :: INDOFFL INTEGER(KIND=JPIM) :: IFLDA(KF_GP) @@ -168,27 +166,33 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3) PGP_INDICES(PGP_INDICES_END) = IOFF - CALL INIGPTR(IGPTRSEND,IGPTRRECV) - ! Prepare sender arrays - DO JROC=1,NPROC - - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - - ! count up expected number of fields - ! if fields are not distributed over processes, KVSET=-=1, so IPOS=KF_GP, - ! otherwise we count how many fields are stored in this V-set. This is used to figure out - ! how much data we are going to send *to* that process JROC. Keep in mind at this point (g-space) - ! all fields are on this process so we have total overlap! - ! basically we count the size of the V set - IPOS = 0 + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because KVSET(JFLD) == -1 if there is only one V-set + ISEND_FLD_TOTAL(1) = KF_GP + ELSE + ISEND_FLD_TOTAL(:) = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 + ISEND_FLD_TOTAL(KVSET(JFLD)) = ISEND_FLD_TOTAL(KVSET(JFLD)) + 1 ENDDO - ISEND_FLD_TOTAL(JROC) = IPOS - ! the W-set is horizontal distribution only - how much data are we going to send to - ! that process? IGPTRRECV tells me about one layer. - ISENDTOT(JROC) = IGPTRRECV(ISETW)*ISEND_FLD_TOTAL(JROC) + ENDIF + ! find number of grid-points on a certain W-set that overlap with myself + IPOINTS_PER_WSET(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + IPOINTS_PER_WSET(D%NPROCL(ILOCAL_LAT)) = & + & IPOINTS_PER_WSET(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ENDDO + ! sum up offsets + IWSET_POINT_OFFSET(1) = 0 + DO JROC=1,NPRTRW + IWSET_POINT_OFFSET(JROC+1)=IWSET_POINT_OFFSET(JROC)+IPOINTS_PER_WSET(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total send size is # points per field * # fields + ISENDTOT(JROC) = IPOINTS_PER_WSET(ISETW)*ISEND_FLD_TOTAL(ISETV) ENDDO INDOFFX = 0 @@ -240,7 +244,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1805,1) !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) & - !$ACC& COPYIN(IGPTRSEND,PGP_INDICES) ASYNC(1) + !$ACC& COPYIN(PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) @@ -265,60 +269,47 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - IGPTROFF(1)=0 - DO JBLK=2,NGPBLKS - IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,MYSETW)-IGPTRSEND(1,JBLK-1,MYSETW)+1 - ENDDO - JK_MAX = MAXVAL(IGPTRSEND(2,:,MYSETW)-IGPTRSEND(1,:,MYSETW))+1 - - !$ACC DATA COPYIN(IFLDA(1:IFLDS),IGPTROFF) ASYNC(1) + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + IWPOINTS_OFFSET = IWSET_POINT_OFFSET(MYSETW) + IWPOINTS_COUNT = IPOINTS_PER_WSET(MYSETW) CALL GSTATS(1601,0) IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) ASYNC(1) - DO JBLK=1,NGPBLKS + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JL=1,IWPOINTS_COUNT DO JFLD=1,KF_FS - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) - ENDIF - ENDDO + JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 + JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = INDOFF(MYPROC)+JL + PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK,IOFF,PBOUND) ASYNC(1) - DO JBLK=1,NGPBLKS + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JL=1,IWPOINTS_COUNT DO JFLD=1,KF_FS - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - IFLD = IFLDA(JFLD) - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - ! TODO we could certainly reshape PGPXX arrays and we would simplify this - PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ENDIF - ENDIF - ENDDO + JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 + JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = INDOFF(MYPROC)+JL + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF ENDDO ENDDO ENDIF @@ -372,7 +363,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ISEND=JSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISEND) + ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISETV) SEND_OFFSET = ICOMBUFS_OFFSET(INS) IFLDS = 0 @@ -387,63 +378,47 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - IGPTROFF(1)=0 - DO JBLK=2,NGPBLKS - IGPTROFF(JBLK)=IGPTROFF(JBLK-1)+IGPTRSEND(2,JBLK-1,ISETW)-IGPTRSEND(1,JBLK-1,ISETW)+1 - ENDDO - JK_MAX = MAXVAL(IGPTRSEND(2,:,ISETW)-IGPTRSEND(1,:,ISETW))+1 - - ! Total send size - IPOS = IGPTRRECV(ISETW) - - !$ACC DATA COPYIN(IGPTROFF,IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) + !$ACC DATA COPYIN(IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) + IWPOINTS_OFFSET = IWSET_POINT_OFFSET(ISETW) + IWPOINTS_COUNT = IPOINTS_PER_WSET(ISETW) IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFIRST,ILAST) ASYNC(1) - DO JBLK=1,NGPBLKS + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JL=1,IWPOINTS_COUNT DO JFLD=1,ISEND_FLD_CNT - DO JKL=1, JK_MAX - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL - ZCOMBUFS(SEND_OFFSET+JI) = PGP(JK,IFLDA(JFLD),JBLK) - ENDIF - ENDDO + JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 + JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = (JFLD-1)*IWPOINTS_COUNT+JL + ZCOMBUFS(SEND_OFFSET+JI) = PGP(JK,IFLDA(JFLD),JBLK) ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLD,IFIRST,ILAST) ASYNC(1) - DO JBLK=1,NGPBLKS + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF) ASYNC(1) + DO JL=1,IWPOINTS_COUNT DO JFLD=1,ISEND_FLD_CNT - DO JKL=1, JK_MAX - IFLD=IFLDA(JFLD) - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - IF(IFIRST > 0 .AND. JK <= ILAST) THEN - JI=(JFLD-1)*IPOS+IGPTROFF(JBLK)+JKL - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(SEND_OFFSET+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(SEND_OFFSET+JI) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(SEND_OFFSET+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(SEND_OFFSET+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ENDIF - ENDIF - ENDDO - ENDDO + JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 + JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = (JFLD-1)*IWPOINTS_COUNT+JL + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + ZCOMBUFS(SEND_OFFSET+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + ZCOMBUFS(SEND_OFFSET+JI) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + ZCOMBUFS(SEND_OFFSET+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + ZCOMBUFS(SEND_OFFSET+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO ENDDO ENDIF !$ACC END DATA From c33af429da5d9541facf560efb11bf9e8a70537e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:03 -0700 Subject: [PATCH 080/263] Merge KINDEX code (and rename) --- src/trans/gpu/internal/trgtol_mod.F90 | 67 +++++++++++---------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 8f6593d39..19d237770 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -118,12 +118,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & - &II,INDOFFX,IBUFLENR,INRECV, IPROC,IFLDS, & + &II,IBUFLENR,INRECV, IPROC,IFLDS, & &INSEND,INS,INR,IR, JKL, PBOUND, IERROR ! LOCAL LOGICAL SCALARS INTEGER(KIND=JPIM) :: IOFF - INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC) + INTEGER(KIND=JPIM) :: ZCOMBUFR_TO_PGLAT(D%NLENGTF),ZCOMBUFR_PROC_OFFSET(NPROC) INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPRTRV),ISEND_FLD_CNT INTEGER(KIND=JPIM) :: IPOINTS_PER_WSET(NPRTRW), IWSET_POINT_OFFSET(NPRTRW+1), ILOCAL_LAT, ILAT_STRIP INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) @@ -195,9 +195,15 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ISENDTOT(JROC) = IPOINTS_PER_WSET(ISETW)*ISEND_FLD_TOTAL(ISETV) ENDDO - INDOFFX = 0 ! Prepare receiver arrays + ZCOMBUFR_PROC_OFFSET(:) = 0 DO JROC=1,NPROC + ! Get new offset to my current ZCOMBUFR entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + ZCOMBUFR_PROC_OFFSET(JROC) = ZCOMBUFR_PROC_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + ZCOMBUFR_PROC_OFFSET(JROC) = ZCOMBUFR_PROC_OFFSET(JROC-1) + ENDIF CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) @@ -211,39 +217,22 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IPOS = 0 DO JGL=IFIRSTLAT,ILASTLAT ! get from "actual" latitude to the latitude strip offset - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - ! number of gridpoints on this latitude strip on my process - IPOS = IPOS+D%NONL(IGL,ISETB) + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! indicates where the data has to be stored + ZCOMBUFR_TO_PGLAT(ZCOMBUFR_PROC_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 + ENDDO ENDDO - ! We always receive the full fourier space + !we always receive the full fourier space IRECVTOT(JROC) = IPOS*KF_FS - - IF(IRECVTOT(JROC) > 0) THEN - ! If I have to recv something, we need to fill KINDEX, this is the unpacking instruction... - - ! INDOFF is the offset to the first gridpoint on this process, only considering a - ! single layer, e.g KF_FS=1 - INDOFF(JROC) = INDOFFX - INDOFFX = INDOFFX+IRECVTOT(JROC)/KF_FS - - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - ! get from "actual" latitude to the latitude strip offset - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - ! get from "actual" latitude to the latitude offset - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=1,D%NONL(IGL,ISETB) - IPOS = IPOS+1 - ! indicates where the data has to be stored - KINDEX(INDOFF(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 - ENDDO - ENDDO - ENDIF ENDDO CALL GSTATS(1805,1) - !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,INDOFF,KINDEX) & + !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,ZCOMBUFR_PROC_OFFSET,ZCOMBUFR_TO_PGLAT) & !$ACC& COPYIN(PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) @@ -281,8 +270,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = INDOFF(MYPROC)+JL - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) + IPOS = ZCOMBUFR_PROC_OFFSET(MYPROC)+JL + PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) ENDDO ENDDO ELSE @@ -292,23 +281,23 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = INDOFF(MYPROC)+JL + IPOS = ZCOMBUFR_PROC_OFFSET(MYPROC)+JL IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) ! TODO we could certainly reshape PGPXX arrays and we would simplify this - PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IOFF+1,JBLK) + PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -477,12 +466,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO INR=1,INRECV IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS - INDOFFL = INDOFF(IRECV) + INDOFFL = ZCOMBUFR_PROC_OFFSET(IRECV) RECV_OFFSET = ICOMBUFR_OFFSET(INR) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN - II = KINDEX(INDOFFL+JL) + II = ZCOMBUFR_TO_PGLAT(INDOFFL+JL) PGLAT(JFLD,II) = ZCOMBUFR(RECV_OFFSET+JL+(JFLD-1)*ILEN) ENDDO ENDDO From a79f8fc0f01363fc5cfdd56a707db56d33901755 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:04 -0700 Subject: [PATCH 081/263] Not critical: bunch of renaming --- src/trans/gpu/internal/trgtol_mod.F90 | 191 +++++++++++++------------- 1 file changed, 95 insertions(+), 96 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 19d237770..8a6df8da8 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -109,8 +109,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) - INTEGER(KIND=JPIM) :: JRECV (NPROC) - INTEGER(KIND=JPIM) :: JSEND (NPROC) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& @@ -118,18 +118,16 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & - &II,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, JKL, PBOUND, IERROR - - ! LOCAL LOGICAL SCALARS - INTEGER(KIND=JPIM) :: IOFF - INTEGER(KIND=JPIM) :: ZCOMBUFR_TO_PGLAT(D%NLENGTF),ZCOMBUFR_PROC_OFFSET(NPROC) - INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPRTRV),ISEND_FLD_CNT - INTEGER(KIND=JPIM) :: IPOINTS_PER_WSET(NPRTRW), IWSET_POINT_OFFSET(NPRTRW+1), ILOCAL_LAT, ILAT_STRIP + &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & + &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT + + INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP + INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V + INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: SEND_OFFSET, RECV_OFFSET, IWPOINTS_COUNT, IWPOINTS_OFFSET - ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: INDOFFL + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IFLDA(KF_GP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -170,39 +168,39 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! find number of fields on a certain V-set IF(NPRTRV == 1) THEN ! This is needed because KVSET(JFLD) == -1 if there is only one V-set - ISEND_FLD_TOTAL(1) = KF_GP + ISEND_FIELD_COUNT(1) = KF_GP ELSE - ISEND_FLD_TOTAL(:) = 0 + ISEND_FIELD_COUNT(:) = 0 DO JFLD=1,KF_GP - ISEND_FLD_TOTAL(KVSET(JFLD)) = ISEND_FLD_TOTAL(KVSET(JFLD)) + 1 + ISEND_FIELD_COUNT(KVSET(JFLD)) = ISEND_FIELD_COUNT(KVSET(JFLD)) + 1 ENDDO ENDIF ! find number of grid-points on a certain W-set that overlap with myself - IPOINTS_PER_WSET(:) = 0 + ISEND_WSET_SIZE(:) = 0 DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 - IPOINTS_PER_WSET(D%NPROCL(ILOCAL_LAT)) = & - & IPOINTS_PER_WSET(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) ENDDO ! sum up offsets - IWSET_POINT_OFFSET(1) = 0 + ISEND_WSET_OFFSET(1) = 0 DO JROC=1,NPRTRW - IWSET_POINT_OFFSET(JROC+1)=IWSET_POINT_OFFSET(JROC)+IPOINTS_PER_WSET(JROC) + ISEND_WSET_OFFSET(JROC+1)=ISEND_WSET_OFFSET(JROC)+ISEND_WSET_SIZE(JROC) ENDDO DO JROC=1,NPROC CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! total send size is # points per field * # fields - ISENDTOT(JROC) = IPOINTS_PER_WSET(ISETW)*ISEND_FLD_TOTAL(ISETV) + ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) ENDDO ! Prepare receiver arrays - ZCOMBUFR_PROC_OFFSET(:) = 0 + IRECV_BUFR_TO_OUT_OFFSET(:) = 0 DO JROC=1,NPROC - ! Get new offset to my current ZCOMBUFR entry + ! Get new offset to my current KINDEX entry IF (JROC > 1 .AND. KF_FS > 0) THEN - ZCOMBUFR_PROC_OFFSET(JROC) = ZCOMBUFR_PROC_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS ELSEIF (JROC > 1) THEN - ZCOMBUFR_PROC_OFFSET(JROC) = ZCOMBUFR_PROC_OFFSET(JROC-1) + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1) ENDIF CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) @@ -223,7 +221,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& DO JL=1,D%NONL(IGL,ISETB) IPOS = IPOS+1 ! indicates where the data has to be stored - ZCOMBUFR_TO_PGLAT(ZCOMBUFR_PROC_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 ENDDO ENDDO !we always receive the full fourier space @@ -232,7 +230,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1805,1) - !$ACC DATA PRESENT(PGLAT) COPYIN(KPTRGP,ZCOMBUFR_PROC_OFFSET,ZCOMBUFR_TO_PGLAT) & + !$ACC DATA PRESENT(PGLAT) COPYIN(IRECV_BUFR_TO_OUT) & !$ACC& COPYIN(PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) @@ -260,44 +258,45 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) - IWPOINTS_OFFSET = IWSET_POINT_OFFSET(MYSETW) - IWPOINTS_COUNT = IPOINTS_PER_WSET(MYSETW) + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) CALL GSTATS(1601,0) IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JL=1,IWPOINTS_COUNT + DO JL=1,ISEND_WSET_SIZE_V DO JFLD=1,KF_FS - JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 - JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = ZCOMBUFR_PROC_OFFSET(MYPROC)+JL - PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP(JK,IFLDA(JFLD),JBLK) + IPOS = IRECV_BUFR_TO_OUT_V+JL + PGLAT(JFLD,IRECV_BUFR_TO_OUT(IPOS)) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JL=1,IWPOINTS_COUNT + DO JL=1,ISEND_WSET_SIZE_V DO JFLD=1,KF_FS - JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 - JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = ZCOMBUFR_PROC_OFFSET(MYPROC)+JL + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL) IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) ! TODO we could certainly reshape PGPXX arrays and we would simplify this - PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP2(JK,IOFF+1,JBLK) + PGLAT(JFLD,IPOS) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - PGLAT(JFLD,ZCOMBUFR_TO_PGLAT(IPOS)) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PGLAT(JFLD,IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -309,51 +308,51 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ! Figure out processes that send or recv something - INSEND = 0 - INRECV = 0 + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 DO JROC=1,NPROC IF( JROC /= MYPROC) THEN IF(IRECVTOT(JROC) > 0) THEN ! I have to recv something, so let me store that - INRECV = INRECV + 1 - JRECV(INRECV)=JROC + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC ENDIF IF(ISENDTOT(JROC) > 0) THEN ! I have to send something, so let me store that - INSEND = INSEND+1 - JSEND(INSEND)=JROC + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC ENDIF ENDIF ENDDO - ALLOCATE(ICOMBUFS_OFFSET(INSEND+1)) + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) ICOMBUFS_OFFSET(1) = 0 - DO JROC=1,INSEND - ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(JSEND(JROC)) + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) ENDDO - ALLOCATE(ICOMBUFR_OFFSET(INRECV+1)) + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) ICOMBUFR_OFFSET(1) = 0 - DO JROC=1,INRECV - ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(JRECV(JROC)) + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO ! Send loop............................................................. - IF (INSEND > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(INSEND+1))) - IF (INRECV > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(INRECV+1))) + IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) + IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(INSEND > 0) CREATE(ZCOMBUFS) ASYNC(1) - !$ACC DATA IF(INRECV > 0) CREATE(ZCOMBUFR) ASYNC(1) + !$ACC DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) + !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) !....Pack loop......................................................... CALL GSTATS(1602,0) - DO INS=1,INSEND - ISEND=JSEND(INS) + DO INS=1,ISEND_COUNTS + ISEND=ISEND_TO_PROC(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - ISEND_FLD_CNT = ISEND_FLD_TOTAL(ISETV) - SEND_OFFSET = ICOMBUFS_OFFSET(INS) + ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) IFLDS = 0 DO JFLD=1,KF_GP @@ -367,45 +366,45 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDIF ENDDO - !$ACC DATA COPYIN(IFLDA(1:ISEND_FLD_CNT)) ASYNC(1) + !$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1) - IWPOINTS_OFFSET = IWSET_POINT_OFFSET(ISETW) - IWPOINTS_COUNT = IPOINTS_PER_WSET(ISETW) + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW) IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JL=1,IWPOINTS_COUNT - DO JFLD=1,ISEND_FLD_CNT - JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 - JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + DO JL=1,ISEND_WSET_SIZE_V + DO JFLD=1,ISEND_FIELD_COUNT_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - JI = (JFLD-1)*IWPOINTS_COUNT+JL - ZCOMBUFS(SEND_OFFSET+JI) = PGP(JK,IFLDA(JFLD),JBLK) + JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF) ASYNC(1) - DO JL=1,IWPOINTS_COUNT - DO JFLD=1,ISEND_FLD_CNT - JK = MOD(IWPOINTS_OFFSET+JL-1,NPROMA)+1 - JBLK = (IWPOINTS_OFFSET+JL-1)/NPROMA+1 + DO JL=1,ISEND_WSET_SIZE_V + DO JFLD=1,ISEND_FIELD_COUNT_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - JI = (JFLD-1)*IWPOINTS_COUNT+JL + JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(SEND_OFFSET+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(SEND_OFFSET+JI) = PGP2(JK,IOFF+1,JBLK) + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(SEND_OFFSET+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(SEND_OFFSET+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -432,17 +431,17 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) ! Receive loop......................................................... - DO INR=1,INRECV + DO INR=1,IRECV_COUNTS IR=IR+1 - IRECV=JRECV(INR) - CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IRECV), & - & TRGTOL_DTYPE,NPRCIDS(IRECV)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + IPROC=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & + & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !....Send loop......................................................... - DO INS=1,INSEND + DO INS=1,ISEND_COUNTS IR=IR+1 - ISEND=JSEND(INS) + ISEND=ISEND_TO_PROC(INS) CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO @@ -463,16 +462,16 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1603,0) - DO INR=1,INRECV - IRECV=JRECV(INR) - ILEN = IRECVTOT(IRECV)/KF_FS - INDOFFL = ZCOMBUFR_PROC_OFFSET(IRECV) - RECV_OFFSET = ICOMBUFR_OFFSET(INR) + DO INR=1,IRECV_COUNTS + IPROC=IRECV_TO_PROC(INR) + ILEN = IRECVTOT(IPROC)/KF_FS + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(IPROC) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN - II = ZCOMBUFR_TO_PGLAT(INDOFFL+JL) - PGLAT(JFLD,II) = ZCOMBUFR(RECV_OFFSET+JL+(JFLD-1)*ILEN) + II = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL) + PGLAT(JFLD,II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) ENDDO ENDDO ENDDO @@ -489,8 +488,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC END DATA !! PRESENT(PGPUV) !$ACC END DATA !! PRESENT(PGP) - IF (INSEND > 0) DEALLOCATE(ZCOMBUFS) - IF (INRECV > 0) DEALLOCATE(ZCOMBUFR) + IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) + IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) From 5bdaa7108ec851ffc6d95f655057c463716fdd3c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:04 -0700 Subject: [PATCH 082/263] Tiny cleanups in TRGTOL as preparation for TRLTOG --- src/trans/gpu/internal/trgtol_mod.F90 | 54 +++++++++++---------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 8a6df8da8..0b9ba6ffb 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -71,24 +71,16 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_BARRIER - - USE TPM_GEN ,ONLY : NOUT, LSYNC_TRANS - USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL,NPRTRW,NPRTRV, & - & MYSETV, MYSETW, MYPROC, NPROC + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS - USE TPM_TRANS ,ONLY : NGPBLKS, NPROMA - + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS, NPROMA USE PE2SET_MOD ,ONLY : PE2SET - !USE MYSENDSET_MOD - USE MPL_DATA_MODULE, only: MPL_COMM_OML - USE OML_MOD, only: OML_MY_THREAD - !USE MYRECVSET_MOD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - ! + USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML + USE OML_MOD ,ONLY : OML_MY_THREAD USE MPI IMPLICIT NONE @@ -104,6 +96,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) + ! LOCAL VARIABLES + + ! LOCAL INTEGER SCALARS REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:),ZCOMBUFR(:) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) @@ -112,8 +107,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) - ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& + INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& &ILASTLAT, ILEN, JROC, IPOS, ISETA, & &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & @@ -129,6 +123,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -230,8 +225,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1805,1) - !$ACC DATA PRESENT(PGLAT) COPYIN(IRECV_BUFR_TO_OUT) & - !$ACC& COPYIN(PGP_INDICES) ASYNC(1) + !$ACC DATA PRESENT(PGLAT) COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) ASYNC(1) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) @@ -284,7 +278,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) - ! TODO we could certainly reshape PGPXX arrays and we would simplify this PGLAT(JFLD,IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) @@ -336,11 +329,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO - ! Send loop............................................................. - IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) @@ -388,23 +378,23 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL + JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP2(JK,IOFF+1,JBLK) + ZCOMBUFS(JI) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -435,7 +425,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IR=IR+1 IPROC=IRECV_TO_PROC(INR) CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & - & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !....Send loop......................................................... @@ -443,7 +433,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IR=IR+1 ISEND=ISEND_TO_PROC(INS) CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & - & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGGL,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA @@ -475,7 +465,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO ENDDO ENDDO - !$ACC WAIT(1) !$ACC END DATA CALL GSTATS(1603,1) @@ -487,6 +476,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC END DATA !! PRESENT(PGP2) !$ACC END DATA !! PRESENT(PGPUV) !$ACC END DATA !! PRESENT(PGP) + !$ACC WAIT(1) IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) From 531ba2f0fe1c45fa608065cb51a38a163a928401 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:04 -0700 Subject: [PATCH 083/263] Align TRLTOG with TRGTOL (huge change, but exactly reversed to TRLTOG) --- src/trans/gpu/internal/trltog_mod.F90 | 646 ++++++++++++-------------- 1 file changed, 291 insertions(+), 355 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index cbe789df3..c1b19f023 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -17,41 +17,41 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! to column structure. This takes place between inverse ! FFT and grid point calculations. ! TRLTOG is the inverse of TRGTOL - + ! Version using CUDA-aware MPI - + ! Purpose. ! -------- - - + + !** Interface. ! ---------- ! *call* *trltog(...) - + ! Explicit arguments : ! -------------------- ! PGLAT - Latitudinal data ready for direct FFT (input) ! PGP - Blocked grid point data (output) ! KVSET - "v-set" for each field (input) - + ! Implicit arguments : ! -------------------- - + ! Method. ! ------- ! See documentation - + ! Externals. ! ---------- - + ! Reference. ! ---------- ! ECMWF Research Department documentation of the IFS - + ! Author. ! ------- ! MPP Group *ECMWF* - + ! Modifications. ! -------------- ! Original : 95-10-01 @@ -69,32 +69,23 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! 08-01-01 G.Mozdzynski: cleanup ! 09-01-02 G.Mozdzynski: use non-blocking recv and send ! ------------------------------------------------------------------ - - - + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_MYRANK, MPL_BARRIER - - USE TPM_GEN ,ONLY : NOUT, LSYNC_TRANS - USE TPM_DISTR ,ONLY : D, MYSETV, MYSETW, MTAGLG, & - & NPRCIDS, NPRTRNS, MYPROC, NPROC - USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS - - USE INIGPTR_MOD ,ONLY : INIGPTR + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS, NPROMA USE PE2SET_MOD ,ONLY : PE2SET - !USE MYSENDSET_MOD - !USE MYRECVSET_MOD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML USE OML_MOD ,ONLY : OML_MY_THREAD - ! USE MPI - + IMPLICIT NONE - - + REAL(KIND=JPRBT), INTENT(IN) :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP @@ -105,55 +96,43 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) - + ! LOCAL VARIABLES - - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) - INTEGER(KIND=JPIM) :: ICOMR_KFFS(NPROC), ICOMS_KFFS(NPROC) - REAL(KIND=JPRBT) :: ZDUM(2) - - INTEGER(KIND=JPIM) :: ISENT (NPROC) - INTEGER(KIND=JPIM) :: IRCVD (NPROC) + + REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:),ZCOMBUFR(:) + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*4) - INTEGER(KIND=JPIM) :: JSEND (NPROC) - INTEGER(KIND=JPIM) :: JRECV (NPROC) - - INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IFLD, IGL, IGLL,& - &ILAST, ILASTLAT, IPOS, ISETA, & - &ISETB, IRECV, IRECVSET, & - &ISETV, ISEND, ITAG, JBLK, JFLD, & - &JGL, JK, JL, JLOOP, ISETW, IFLDS, IPROC,JROC, & - &INRECV, INSEND,INR,INS,IR, JKL, JK_MAX - INTEGER(KIND=JPIM) :: II,INDOFFX,ILEN,IBUFLENS,IBUFLENR - - LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY - LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) - LOGICAL :: LLDONE, LLINDER + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) + + INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS + INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& + &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & + &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & + &JBLK, ILAT_STRIP + INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF - INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 - INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(KF_GP) - INTEGER(KIND=JPIM) :: IRECV_FLD_START,IRECV_FLD_END - INTEGER(KIND=JPIM) :: ISEND_FLD_START(NPROC),ISEND_FLD_END - INTEGER(KIND=JPIM) :: INUMFLDS - INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) - - ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: ISENDCOUNT,IRECVCOUNT,J - INTEGER(KIND=JPIM) :: JPOS(NGPBLKS),IFLDA(KF_GP),JI,JJ - INTEGER(KIND=JPIM) :: IFLDT + INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW + INTEGER(KIND=JPIM) :: NDERS + + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V + INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + + LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY + LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) + LOGICAL :: LLINDER + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - - INTEGER(KIND=JPIM), DIMENSION(MPI_STATUS_SIZE,NPROC*2) :: ISTATUS - INTEGER(KIND=JPIM) :: IERROR - - REAL(KIND=JPRBT) :: TIMEF, Tc #ifdef PARKINDTRANS_SINGLE #define TRLTOG_DTYPE MPI_REAL @@ -161,16 +140,16 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& #define TRLTOG_DTYPE MPI_DOUBLE_PRECISION #endif - + ! ------------------------------------------------------------------ - + !* 0. Some initializations ! -------------------- IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - - + + CALL GSTATS(1806,0) - + LLINDER = .FALSE. LLPGPUV = .FALSE. LLPGP3A = .FALSE. @@ -183,13 +162,14 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF(PRESENT(PGP3A)) LLPGP3A=.TRUE. IF(PRESENT(PGP3B)) LLPGP3B=.TRUE. IF(PRESENT(PGP2)) LLPGP2=.TRUE. - + IUVPAR=0 IUVLEV=0 IOFF1=0 IOFFNS=KF_SCALARS_G IOFFEW=2*KF_SCALARS_G - + + LLUV(:) = .FALSE. IF (LLPGPUV) THEN IOFF=0 @@ -224,7 +204,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IOFF1=IOFF IOFFNS=IOFFNS+IOFF IOFFEW=IOFFEW+IOFF - + IOFF=IUVPAR*IUVLEV+KF_SCALARS_G IF(LUVDER) THEN IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G @@ -241,7 +221,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IOFFEW=IOFFEW+2*IUVLEV ENDIF ENDIF - + LLGP2(:)=.FALSE. IF(LLPGP2) THEN IOFF=IOFF1 @@ -267,7 +247,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IOFFEW=IOFF+IGP2PAR ENDIF ENDIF - + LLGP3A(:) = .FALSE. IF(LLPGP3A) THEN IGP3ALEV=UBOUND(PGP3A,2) @@ -305,7 +285,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV ENDIF ENDIF - + LLGP3B(:) = .FALSE. IF(LLPGP3B) THEN IGP3BLEV=UBOUND(PGP3B,2) @@ -343,217 +323,200 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV ENDIF ENDIF - - CALL INIGPTR(IGPTRSEND,IGPTRRECV) - LLDONE = .FALSE. - ITAG = MTAGLG - - INDOFFX = 0 - IBUFLENS = 0 - IBUFLENR = 0 - INRECV = 0 - INSEND = 0 - - DO JROC=1,NPROC - - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - ISEND = JROC - ISENT(JROC) = 0 - IRCVD(JROC) = 0 - - ! count up expected number of fields - IPOS = 0 + + ! Prepare receiver arrays + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because KVSET(JFLD) == -1 if there is only one V-set + IRECV_FIELD_COUNT(1) = KF_GP + ELSE + IRECV_FIELD_COUNT(:) = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 + IRECV_FIELD_COUNT(KVSET(JFLD)) = IRECV_FIELD_COUNT(KVSET(JFLD)) + 1 ENDDO - IRECVTOT(JROC) = IGPTRRECV(ISETW)*IPOS - IF(IRECVTOT(JROC) > 0 .AND. MYPROC /= JROC) THEN - INRECV = INRECV + 1 - JRECV(INRECV)=JROC + ENDIF + ! find number of grid-points on a certain W-set that overlap with myself + IRECV_WSET_SIZE(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ENDDO + ! sum up offsets + IRECV_WSET_OFFSET(1) = 0 + DO JROC=1,NPRTRW + IRECV_WSET_OFFSET(JROC+1)=IRECV_WSET_OFFSET(JROC)+IRECV_WSET_SIZE(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total recv size is # points per field * # fields + IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) + ENDDO + + ! Prepare sender arrays + IIN_TO_SEND_BUFR_OFFSET(1) = 0 + DO JROC=1,NPROC + ! Get new offset to my current KINDEX entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)+ISENDTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1) ENDIF - - IF( JROC /= MYPROC) IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) - + + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) - + IPOS = 0 DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - IPOS = IPOS+D%NONL(IGL,ISETB) + ! get from "actual" latitude to the latitude strip offset + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! indicates where the data has to be stored + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 + ENDDO ENDDO - + !we always receive the full fourier space ISENDTOT(JROC) = IPOS*KF_FS - IF( JROC /= MYPROC) THEN - IBUFLENS = MAX(IBUFLENS,ISENDTOT(JROC)) - IF(ISENDTOT(JROC) > 0) THEN - INSEND = INSEND+1 - JSEND(INSEND)=JROC - ENDIF - ENDIF - - IF(IPOS > 0) THEN - INDOFF(JROC) = INDOFFX - INDOFFX = INDOFFX+IPOS - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),& - &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1 - IPOS = IPOS+1 - KINDEX(IPOS+INDOFF(JROC)) = JL - ENDDO - ENDDO - ENDIF ENDDO - - ISENDCOUNT=0 - IRECVCOUNT=0 - DO J=1,NPROC - ISENDCOUNT=MAX(ISENDCOUNT,ISENDTOT(J)) - IRECVCOUNT=MAX(IRECVCOUNT,IRECVTOT(J)) - ENDDO - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(1:ISENDCOUNT,INSEND)) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(1:IRECVCOUNT,INRECV)) - !$ACC DATA IF(IBUFLENS > 0) CREATE(ZCOMBUFS) - !$ACC DATA IF(IBUFLENR > 0) CREATE(ZCOMBUFR) - - !$ACC DATA & - !$ACC PRESENT(PGLAT) & - !$ACC COPYIN(IGPTRSEND,INDOFF,KINDEX, LLUV,LLGP2,LLGP3A,LLGP3B,KPTRGP) - !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) - !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) COPYIN(IUVLEVS,IUVPARS) - !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) COPYIN(IGP2PARS) - !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) COPYIN(IGP3APARS,IGP3ALEVS) - !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) COPYIN(IGP3BPARS, IGP3BLEVS) + + !$ACC DATA PRESENT(PGLAT) COPYIN(IIN_TO_SEND_BUFR,LLUV,LLGP2,LLGP3A,LLGP3B) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) COPYIN(IUVLEVS,IUVPARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) COPYIN(IGP2PARS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) COPYIN(IGP3APARS,IGP3ALEVS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) COPYIN(IGP3BPARS, IGP3BLEVS) ASYNC(1) CALL GSTATS(1806,1) - + ! Copy local contribution - IF( IRECVTOT(MYPROC) > 0 )THEN + IF(ISENDTOT(MYPROC) > 0) THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. IFLDS = 0 DO JFLD=1,KF_GP IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN IFLDS = IFLDS+1 - IF(LLINDER) THEN - IFLDOFF(IFLDS) = KPTRGP(JFLD) + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) ELSE - IFLDOFF(IFLDS) = JFLD + IFLDA(IFLDS) = JFLD ENDIF ENDIF ENDDO - - IPOS=0 - JK_MAX=0 - DO JBLK=1,NGPBLKS - IGPTROFF(JBLK)=IPOS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IPOS=IPOS+ILAST-IFIRST+1 - IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1) - ENDIF - ENDDO - + + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + CALL GSTATS(1604,0) - IF (LLPGPONLY) THEN - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & - !$ACC COPYIN(IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) - DO JBLK=1,NGPBLKS - DO JFLD=1,IFLDS - DO JKL=1,JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST>0 .AND. JK<=ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - IF(LLINDER) THEN - IFLD = KPTRGP(JFLD) - ELSE - IFLD = IFLDOFF(JFLD) - ENDIF - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,KINDEX(IPOS)) - ENDIF - ENDDO + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) + IF (PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JL=1,IRECV_WSET_SIZE_V + DO JFLD=1,KF_FS + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR_V+JL + PGP(JK,IFLD,JBLK) = PGLAT(JFLD,IIN_TO_SEND_BUFR(IPOS)) ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & - !$ACC COPYIN(IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) - DO JBLK=1,NGPBLKS - DO JFLD=1,IFLDS - DO JKL=1,JK_MAX - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - ILAST = IGPTRSEND(2,JBLK,MYSETW) - JK = JKL+IFIRST-1 - IF(IFIRST>0 .AND. JK<=ILAST) THEN - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JKL - IFLD = IFLDOFF(JFLD) - IF(LLUV(IFLD)) THEN - PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PGLAT(JFLD,KINDEX(IPOS)) - ELSEIF(LLGP2(IFLD)) THEN - PGP2(JK,IGP2PARS(IFLD),JBLK)=PGLAT(JFLD,KINDEX(IPOS)) - ELSEIF(LLGP3A(IFLD)) THEN - PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PGLAT(JFLD,KINDEX(IPOS)) - ELSEIF(LLGP3B(IFLD)) THEN - PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PGLAT(JFLD,KINDEX(IPOS)) - ENDIF - ENDIF - ENDDO + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JL=1,IRECV_WSET_SIZE_V + DO JFLD=1,KF_FS + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL) + IF(LLUV(IFLD)) THEN + PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PGLAT(JFLD,IPOS) + ELSEIF(LLGP2(IFLD)) THEN + PGP2(JK,IGP2PARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) + ELSEIF(LLGP3A(IFLD)) THEN + PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) + ELSEIF(LLGP3B(IFLD)) THEN + PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) + ENDIF ENDDO ENDDO ENDIF CALL GSTATS(1604,1) + !$ACC END DATA + ENDIF - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - ! - ! loop over the number of processors we need to communicate with. - ! NOT MYPROC - ! - ! Pack loop......................................................... - + + ! Figure out processes that send or recv something + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC + ENDIF + ENDIF + ENDDO + + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) + ENDDO + + IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) + IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) + !$ACC DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) + !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) + CALL GSTATS(1605,0) - DO INS=1,INSEND - ISEND=JSEND(INS) - ILEN = ISENDTOT(ISEND)/KF_FS - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN) COLLAPSE(2) - DO JL=1,ILEN - DO JFLD=1,KF_FS - II = KINDEX(INDOFF(ISEND)+JL) - ZCOMBUFS((JFLD-1)*ILEN+JL,INS) = PGLAT(JFLD,II) - ENDDO + DO INS=1,ISEND_COUNTS + IPROC = ISEND_TO_PROC(INS) + ILEN = ISENDTOT(IPROC)/KF_FS + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COLLAPSE(2) ASYNC(1) + DO JL=1,ILEN + DO JFLD=1,KF_FS + II = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL) + ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PGLAT(JFLD,II) ENDDO - ICOMS_KFFS(INS) = KF_FS ENDDO + ENDDO + !$ACC WAIT(1) - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "packing (trltog) in sec: ", Tc - #endif - CALL GSTATS(1605,1) - + IR=0 IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',0,ZHOOK_HANDLE_BAR) CALL GSTATS_BARRIER(762) IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',1,ZHOOK_HANDLE_BAR) CALL GSTATS(805,0) - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - + IF (LSYNC_TRANS) THEN CALL GSTATS(422,0) CALL MPL_BARRIER(CDSTRING='TRLTOG BARRIER') @@ -563,35 +526,22 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !...Receive loop......................................................... !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) - DO INR=1,INRECV + DO INR=1,IRECV_COUNTS IR=IR+1 - IRECV=JRECV(INR) - CALL MPI_IRECV(ZCOMBUFR(1:IRECVTOT(IRECV),INR), & + IRECV=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & & IRECVTOT(IRECV), & & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & - & ITAG, MPL_COMM_OML(OML_MY_THREAD()), IREQ(IR), & - & IERROR ) - IR=IR+1 - CALL MPI_IRECV(ICOMR_KFFS(INR), 1, & - & MPI_INTEGER,NPRCIDS(IRECV)-1, & - & ITAG, MPL_COMM_OML(OML_MY_THREAD()), IREQ(IR), & + & MTAGLG, MPL_COMM_OML(OML_MY_THREAD()), IREQ(IR), & & IERROR ) ENDDO !...Send loop......................................................... - DO INS=1,INSEND + DO INS=1,ISEND_COUNTS IR=IR+1 - ISEND=JSEND(INS) - CALL MPI_ISEND(ZCOMBUFS(1:ISENDTOT(ISEND),INS),& - & ISENDTOT(ISEND), & - & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,ITAG, & - & MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR), & - & IERROR) - IR=IR+1 - CALL MPI_ISEND(ICOMS_KFFS(INS),1, & - & MPI_INTEGER, NPRCIDS(ISEND)-1,ITAG, & - & MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR), & - & IERROR) + ISEND=ISEND_TO_PROC(INS) + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO !$ACC END HOST_DATA @@ -599,110 +549,96 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & & CDSTRING='TRLTOG_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') ENDIF - + IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='TRLTOG BARRIER') ENDIF CALL GSTATS(412,1) - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "CUDA-aware isend/irecv (trltog) in sec: ", Tc - #endif - + CALL GSTATS(805,1) CALL GSTATS_BARRIER2(762) - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif + ! Unpack loop......................................................... - + CALL GSTATS(1606,0) - DO INR=1,INRECV - IRECV=JRECV(INR) - CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) - IRECVSET = ISETV -! IRECV_FLD_START = 1 !! INT(ZCOMBUFR(-1,INR),KIND=JPIM) !! is this always 1 ? - IRECV_FLD_END = ICOMR_KFFS(INR) - IFLD = 0 - IPOS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == IRECVSET .OR. KVSET(JFLD) == -1 ) THEN - IFLD = IFLD+1 - IFLDA(IFLD)=JFLD - ENDIF - ENDDO - - JK_MAX=0 - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - JPOS(JBLK)=IPOS - IPOS=IPOS+(ILAST-IFIRST+1) - IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1) + DO INR=1,IRECV_COUNTS + IRECV=IRECV_TO_PROC(INR) + CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) + + IRECV_FIELD_COUNT_V = IRECV_FIELD_COUNT(ISETV) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) + + IFLDS = 0 + DO JFLD=1,KF_GP + IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD ENDIF + ENDIF + ENDDO + + !$ACC DATA COPYIN(IFLDA(1:IRECV_FIELD_COUNT_V)) ASYNC(1) + + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) + IF (PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JL=1,IRECV_WSET_SIZE_V + DO JFLD=1,IRECV_FIELD_COUNT_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI) + ENDDO ENDDO - - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IFIRST,ILAST,JI,JK,IFLDT) & - !$ACC COPYIN(INR,KF_FS,IPOS,JPOS,IFLD,IFLDA,JK_MAX,IRECV_FLD_END) COLLAPSE(3) - DO JBLK=1,NGPBLKS - DO JJ=1,IRECV_FLD_END - DO JKL=1,JK_MAX - IFLDT=IFLDA(JJ) - IFIRST = IGPTRSEND(1,JBLK,ISETW) - ILAST = IGPTRSEND(2,JBLK,ISETW) - JK = JKL+IFIRST-1 - JI=(JJ-1)*IPOS+JPOS(JBLK)+JKL - IF(IFIRST > 0 .AND. JK<=ILAST) THEN - IF(LLINDER) THEN - PGP(JK,KPTRGP(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ELSEIF(LLPGPONLY) THEN - PGP(JK,IFLDT,JBLK) = ZCOMBUFR(JI,INR) - ELSEIF(LLUV(IFLDT)) THEN - PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ELSEIF(LLGP2(IFLDT)) THEN - PGP2(JK,IGP2PARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ELSEIF(LLGP3A(IFLDT)) THEN - PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ELSEIF(LLGP3B(IFLDT)) THEN - PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDIF - ENDIF - ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JL=1,IRECV_WSET_SIZE_V + DO JFLD=1,IRECV_FIELD_COUNT_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + IF(LLUV(IFLD)) THEN + PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = ZCOMBUFR(JI) + ELSEIF(LLGP2(IFLD)) THEN + PGP2(JK,IGP2PARS(IFLD),JBLK) = ZCOMBUFR(JI) + ELSEIF(LLGP3A(IFLD)) THEN + PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) = ZCOMBUFR(JI) + ELSEIF(LLGP3B(IFLD)) THEN + PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) = ZCOMBUFR(JI) + ENDIF ENDDO ENDDO - - ENDDO - - CALL GSTATS(431,0) - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA + ENDIF !$ACC END DATA + ENDDO - !$ACC END DATA !! CREATE ZCOMBUFR - !$ACC END DATA !! CREATE ZCOMBUFS - CALL GSTATS(431,1) + CALL GSTATS(431,0) + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + + !$ACC END DATA !! CREATE ZCOMBUFR + !$ACC END DATA !! CREATE ZCOMBUFS + !$ACC WAIT(1) + CALL GSTATS(431,1) - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "unpacking (trltog) in sec: ", Tc - #endif - CALL GSTATS(1606,1) IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - + IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) - + END SUBROUTINE TRLTOG_CUDAAWARE - + SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) From 33f2b05f752cf494602a064844beeed84244cc5b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:05 -0700 Subject: [PATCH 084/263] Reallocate ZOA2 inside ltdir and adapt interfaces accoringly --- src/trans/gpu/internal/ledir_mod.F90 | 1 + src/trans/gpu/internal/ltdir_ctl_mod.F90 | 4 +- src/trans/gpu/internal/ltdir_mod.F90 | 51 +++++++++++------------- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 10 ++--- src/trans/gpu/internal/ltinv_mod.F90 | 18 --------- src/trans/gpu/internal/updsp_mod.F90 | 27 +++---------- src/trans/gpu/internal/uvtvd_mod.F90 | 26 ++---------- 7 files changed, 41 insertions(+), 96 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index b09f39380..4caabe680 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -126,6 +126,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) IF (JF .LE. 4*KF_UV) THEN + ! Multiply in case of velocity PAIA = PAIA*F%RACTHE(JGL) PAIS = PAIS*F%RACTHE(JGL) ENDIF diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index fb6aa29cf..da7ac3973 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -65,7 +65,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - INTEGER(KIND=JPIM) :: JM,IM,ILED2 + INTEGER(KIND=JPIM) :: JM,IM !$ACC DATA PRESENT(FOUBUF_IN) CREATE(FOUBUF) @@ -84,7 +84,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & CALL GSTATS(1645,0) IF (KF_FS > 0) THEN - CALL LTDIR(KF_FS,KF_UV,KF_SCALARS,ILED2, & + CALL LTDIR(KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 940d34561..fbfc9258e 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -10,7 +10,7 @@ MODULE LTDIR_MOD CONTAINS - SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& + SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -27,6 +27,7 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& USE LEDIR_MOD ,ONLY : LEDIR USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP + USE UPDSPB_MOD ,ONLY : UPDSPB USE TPM_FIELDS ,ONLY : ZOA1,ZOA2,ZEPSNM @@ -86,26 +87,10 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,KLED2,& IMPLICIT NONE - INTERFACE - SUBROUTINE cudaProfilerStart() BIND(C,name='cudaProfilerStart') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStart - END INTERFACE - - INTERFACE - SUBROUTINE cudaProfilerStop() BIND(C,name='cudaProfilerStop') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStop - END INTERFACE - - - ! DUMMY INTEGER SCALARS INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS,KLED2 + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) @@ -121,6 +106,7 @@ END SUBROUTINE cudaProfilerStop INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRB), ALLOCATABLE :: POA2(:,:,:) !call cudaProfilerStart @@ -146,12 +132,10 @@ END SUBROUTINE cudaProfilerStop !* 5. COMPUTE VORTICITY AND DIVERGENCE. ! --------------------------------- - + IF( KF_UV > 0 ) THEN - !stop 'Error: code path not (yet) supported in GPU version' - !!CALL PREPSNM - + IUS = 1 IUE = 2*KF_UV IVS = 2*KF_UV+1 @@ -160,9 +144,22 @@ END SUBROUTINE cudaProfilerStop IVORE = 2*KF_UV IDIVS = 2*KF_UV+1 IDIVE = 4*KF_UV - CALL UVTVD(KF_UV) - ! CALL UVTVD(KF_UV,ZEPSNM,ZOA1(IUS:IUE,:,:),ZOA1(IVS:IVE,:,:),& -! & ZOA2(IVORS:IVORE,:,:),ZOA2(IDIVS:IDIVE,:,:)) + + ALLOCATE(POA2(4*KF_UV,R%NTMAX+3,D%NUMP)) + !$ACC ENTER DATA CREATE(POA2) + + + ! Compute vorticity and divergence + CALL UVTVD(KF_UV,ZOA1(IUS:IUE,:,:),ZOA1(IVS:IVE,:,:),& + & POA2(IVORS:IVORE,:,:),POA2(IDIVS:IDIVE,:,:)) + + ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV + CALL UPDSPB(KF_UV,POA2(IVORS:IVORE,:,:),PSPVOR,KFLDPTRUV) + CALL UPDSPB(KF_UV,POA2(IDIVS:IDIVE,:,:),PSPDIV,KFLDPTRUV) + + + !$ACC EXIT DATA DELETE(POA2) + DEALLOCATE(POA2) ENDIF ! ------------------------------------------------------------------ @@ -178,8 +175,8 @@ END SUBROUTINE cudaProfilerStop ! KM = D%MYMS(KMLOC) ! this is on the host, so need to cp from device, Nils - CALL UPDSP(KF_UV,KF_SCALARS,ZOA1,ZOA2, & - & PSPVOR,PSPDIV,PSPSCALAR,& + CALL UPDSP(KF_UV,KF_SCALARS,ZOA1,& + & PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 7cb488521..e96a823fb 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -95,11 +95,11 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& CALL GSTATS(1647,0) ! from PSPXXX to FOUBUF_IN - CALL LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,ILEI2,IDIM1,& - & PSPVOR,PSPDIV,PSPSCALAR ,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - + CALL LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,ILEI2,IDIM1,& + & PSPVOR,PSPDIV,PSPSCALAR ,& + & PSPSC3A,PSPSC3B,PSPSC2 , & + & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) + CALL GSTATS(1647,1) ENDIF CALL GSTATS(102,1) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index c0c919d4b..f2cbadadf 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -89,21 +89,6 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IMPLICIT NONE - INTERFACE - SUBROUTINE cudaProfilerStart() BIND(C,name='cudaProfilerStart') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStart - END INTERFACE - - INTERFACE - SUBROUTINE cudaProfilerStop() BIND(C,name='cudaProfilerStop') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStop - END INTERFACE - - INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS @@ -136,9 +121,6 @@ END SUBROUTINE cudaProfilerStop INTEGER(KIND=JPIM) :: KMLOC - !call cudaProfilerStart - - ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. diff --git a/src/trans/gpu/internal/updsp_mod.F90 b/src/trans/gpu/internal/updsp_mod.F90 index aa5f4c5f8..388f0c139 100755 --- a/src/trans/gpu/internal/updsp_mod.F90 +++ b/src/trans/gpu/internal/updsp_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 1988- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,8 +10,8 @@ MODULE UPDSP_MOD CONTAINS -SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & - & PSPVOR,PSPDIV,PSPSCALAR,& +SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1, & + & PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) @@ -29,9 +30,6 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & ! -------------------- ! KM - zonal wave-number ! POA1 - spectral fields for zonal wavenumber KM (basic var.) -! POA2 - spectral fields for zonal wavenumber KM (vor. div.) -! PSPVOR - spectral vorticity -! PSPDIV - spectral divergence ! PSPSCALAR - spectral scalar variables ! Implicit arguments : @@ -77,9 +75,6 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS REAL(KIND=JPRBT) , INTENT(IN) :: POA1(:,:,:) -REAL(KIND=JPRBT) , INTENT(IN) :: POA2(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC2(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3A(:,:,:) @@ -88,7 +83,7 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) ! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IVORS, IVORE, IDIVS, IDIVE, IST ,IEND,JFLD,IFLD,IDIM1,IDIM3,J3 +INTEGER(KIND=JPIM) :: IST ,IEND,JFLD,IFLD,IDIM1,IDIM3,J3 ! ------------------------------------------------------------------ @@ -98,24 +93,13 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & !* 1.1 VORTICITY AND DIVERGENCE. -!$ACC DATA PRESENT(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA PRESENT(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) !$ACC DATA PRESENT(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) !$ACC DATA PRESENT(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) !$ACC DATA PRESENT(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) IST = 1 -IF (KF_UV > 0) THEN - !stop 'Error: code path not (yet) supported in GPU version' - - IST = IST+4*KF_UV - IVORS = 1 - IVORE = 2*KF_UV - IDIVS = 2*KF_UV+1 - IDIVE = 4*KF_UV - CALL UPDSPB(KF_UV,POA2(IVORS:IVORE,:,:),PSPVOR,KFLDPTRUV) - CALL UPDSPB(KF_UV,POA2(IDIVS:IDIVE,:,:),PSPDIV,KFLDPTRUV) -ENDIF +IST = IST+4*KF_UV !* 1.2 SCALARS @@ -155,7 +139,6 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1,POA2, & !$ACC END DATA !$ACC END DATA !$ACC END DATA -!$ACC END DATA ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index f5fa2021c..bd0aac9a0 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -10,8 +10,7 @@ MODULE UVTVD_MOD CONTAINS -SUBROUTINE UVTVD(KF_UV) -!SUBROUTINE UVTVD(KF_UV,PEPSNM,PU,PV,PVOR,PDIV) +SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !**** *UVTVD* - Compute vor/div from u and v in spectral space @@ -63,7 +62,7 @@ SUBROUTINE UVTVD(KF_UV) USE TPM_DIM ,ONLY : R, R_NTMAX USE TPM_FIELDS ,ONLY : F USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS -USE TPM_FIELDS ,ONLY : ZOA1,ZOA2,ZEPSNM +USE TPM_FIELDS ,ONLY : ZEPSNM ! IMPLICIT NONE @@ -72,37 +71,20 @@ SUBROUTINE UVTVD(KF_UV) INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV INTEGER(KIND=JPIM) :: KM, KMLOC -!REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:d%nump,0:R%NTMAX+2) -!REAL(KIND=JPRBT), INTENT(OUT) :: PVOR(:,:,:),PDIV(:,:,:) -!REAL(KIND=JPRBT), INTENT(INOUT) :: PU (:,:,:),PV (:,:,:) +REAL(KIND=JPRBT), INTENT(OUT) :: PVOR(:,:,:),PDIV(:,:,:) +REAL(KIND=JPRBT), INTENT(INOUT) :: PU (:,:,:),PV (:,:,:) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: II, IN, IR, J, JN, ITMAX -INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM,ZJN -REAL(KIND=JPRBT), POINTER :: PU(:,:,:),PV(:,:,:),PVOR(:,:,:),PDIV(:,:,:) - -IUS = 1 -IUE = 2*KF_UV -IVS = 2*KF_UV+1 -IVE = 4*KF_UV -IVORS = 1 -IVORE = 2*KF_UV -IDIVS = 2*KF_UV+1 -IDIVE = 4*KF_UV ! ------------------------------------------------------------------ !* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. ! ------------------------------------------ -PU => ZOA1(IUS:IUE,:,:) -PV => ZOA1(IVS:IVE,:,:) -PVOR => ZOA2(IVORS:IVORE,:,:) -PDIV => ZOA2(IDIVS:IDIVE,:,:) - !$ACC DATA& !$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & !$ACC& PRESENT(F,F%RN,F%NLTN) & From 3cc867c3a3df42a111ca36077b0119537b7613af Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:05 -0700 Subject: [PATCH 085/263] LTINV reallocates PIA now --- src/trans/gpu/internal/ltinv_mod.F90 | 61 ++++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index f2cbadadf..e1bc448f6 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -19,7 +19,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, foubuf_in + USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, foubuf_in, LSCDERS USE TPM_FLT USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D @@ -33,7 +33,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& USE FSPGL_INT_MOD ,ONLY : FSPGL_INT USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS use ieee_arithmetic - USE TPM_FIELDS ,ONLY : F,ZIA,ZEPSNM + USE TPM_FIELDS ,ONLY : F,ZEPSNM USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS @@ -112,7 +112,9 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& INTEGER(KIND=JPIM) :: ISTA, IIFC, IDGLU INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,ISU,IDL,IDU INTEGER(KIND=JPIM) :: IFIRST, ILAST, IDIM1,IDIM2,IDIM3,J3 - + + REAL(KIND=JPRB), ALLOCATABLE :: PIA(:,:,:) + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE !CHARACTER(LEN=10) :: CLHOOK @@ -135,9 +137,6 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. ! ---------------------------------------------- - IFIRST = 1 - ILAST = 0 - ! COPY FROM PSPXXXX TO ZIA IF (LSYNC_TRANS) THEN @@ -152,6 +151,23 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') ENDIF + + ! Compute PIA Domain decomposition + IFIRST = 0 + IFIRST = IFIRST + 2*KF_UV ! Vorticity or divergence + IFIRST = IFIRST + 2*KF_UV ! Vorticity or divergence + IFIRST = IFIRST + 2*KF_UV ! U + IFIRST = IFIRST + 2*KF_UV ! V + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars + IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + + ALLOCATE(PIA(IFIRST,R%NSMAX+3,D%NUMP)) + !$ACC ENTER DATA CREATE(PIA) + + + IFIRST = 1 + ILAST = 0 + CALL GSTATS(431,1) IF (KF_UV > 0) THEN IVORL = 1 @@ -164,13 +180,13 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IVU = 8*KF_UV IDIM2=UBOUND(PSPVOR,2) - CALL PRFI1B(ZIA(IVORL:IVORU,:,:),PSPVOR,KF_UV,IDIM2,KFLDPTRUV) - CALL PRFI1B(ZIA(IDIVL:IDIVU,:,:),PSPDIV,KF_UV,IDIM2,KFLDPTRUV) + CALL PRFI1B(PIA(IVORL:IVORU,:,:),PSPVOR,KF_UV,IDIM2,KFLDPTRUV) + CALL PRFI1B(PIA(IDIVL:IDIVU,:,:),PSPDIV,KF_UV,IDIM2,KFLDPTRUV) ! ------------------------------------------------------------------ - CALL VDTUV(KF_UV,ZEPSNM,ZIA(IVORL:IVORU,:,:),ZIA(IDIVL:IDIVU,:,:),& - & ZIA(IUL:IUU,:,:),ZIA(IVL:IVU,:,:)) + CALL VDTUV(KF_UV,ZEPSNM,PIA(IVORL:IVORU,:,:),PIA(IDIVL:IDIVU,:,:),& + & PIA(IUL:IUU,:,:),PIA(IVL:IVU,:,:)) ILAST = ILAST+8*KF_UV ENDIF @@ -181,13 +197,13 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& ILAST = IFIRST - 1 + 2*KF_SCALARS IDIM2=UBOUND(PSPSCALAR,2) - CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSCALAR(:,:),KF_SCALARS,IDIM2,KFLDPTRSC) + CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSCALAR(:,:),KF_SCALARS,IDIM2,KFLDPTRSC) ELSE IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN IFIRST = ILAST+1 ILAST = IFIRST-1+2*NF_SC2 IDIM2=UBOUND(PSPSC2,2) - CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC2(:,:),NF_SC2,IDIM2) + CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC2(:,:),NF_SC2,IDIM2) ENDIF IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN IDIM1=NF_SC3A @@ -196,7 +212,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& ILAST = IFIRST-1+2*IDIM1 IDIM2=UBOUND(PSPSC3A,2) DO J3=1,IDIM3 - CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC3A(:,:,J3),IDIM1,IDIM2) + CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC3A(:,:,J3),IDIM1,IDIM2) ENDDO ENDIF IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN @@ -207,14 +223,14 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IFIRST = ILAST+1 ILAST = IFIRST-1+2*IDIM1 - CALL PRFI1B(ZIA(IFIRST:ILAST,:,:),PSPSC3B(:,:,J3),IDIM1,IDIM2) + CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC3B(:,:,J3),IDIM1,IDIM2) ENDDO ENDIF ENDIF - IF(ILAST /= 8*KF_UV+2*KF_SCALARS) THEN - WRITE(0,*) 'LTINV:KF_UV,KF_SCALARS,ILAST ',KF_UV,KF_SCALARS,ILAST - CALL ABORT_TRANS('LTINV_MOD:ILAST /= 8*KF_UV+2*KF_SCALARS') - ENDIF + ENDIF + IF(ILAST /= 8*KF_UV+2*KF_SCALARS) THEN + WRITE(0,*) 'LTINV:KF_UV,KF_SCALARS,ILAST ',KF_UV,KF_SCALARS,ILAST + CALL ABORT_TRANS('LTINV_MOD:ILAST /= 8*KF_UV+2*KF_SCALARS') ENDIF !$ACC END DATA !$ACC END DATA @@ -228,7 +244,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& ISU = ISL+2*KF_SCALARS-1 IDL = 2*(4*KF_UV+KF_SCALARS)+1 IDU = IDL+2*KF_SCDERS-1 - CALL SPNSDE(KF_SCALARS,ZEPSNM,ZIA(ISL:ISU,:,:),ZIA(IDL:IDU,:,:)) + CALL SPNSDE(KF_SCALARS,ZEPSNM,PIA(ISL:ISU,:,:),PIA(IDL:IDU,:,:)) ENDIF ! ------------------------------------------------------------------ @@ -248,18 +264,19 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& ENDIF IF( KF_OUT_LT > 0 ) THEN - CALL LEINV(KF_OUT_LT,ZIA(ISTA:ISTA+2*KF_OUT_LT-1,:,:)) + CALL LEINV(KF_OUT_LT,PIA(ISTA:ISTA+2*KF_OUT_LT-1,:,:)) IF(PRESENT(FSPGL_PROC)) THEN stop 'Error: SPGL_PROC is not (yet) optimized in GPU version. Need to figure out how to implement' ENDIF ENDIF + + !$ACC EXIT DATA DELETE(PIA) + DEALLOCATE(PIA) IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ - !call cudaProfilerStop - END SUBROUTINE LTINV END MODULE LTINV_MOD From 7c7300e9579e6d1e0912a0d04026a5227bffeb56 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:05 -0700 Subject: [PATCH 086/263] Fix allocations in LTINV --- src/trans/gpu/internal/ltinv_mod.F90 | 150 ++++++++++++--------------- 1 file changed, 67 insertions(+), 83 deletions(-) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index e1bc448f6..aafbf42d2 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -109,11 +109,12 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& EXTERNAL FSPGL_PROC OPTIONAL FSPGL_PROC - INTEGER(KIND=JPIM) :: ISTA, IIFC, IDGLU - INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,ISU,IDL,IDU - INTEGER(KIND=JPIM) :: IFIRST, ILAST, IDIM1,IDIM2,IDIM3,J3 + INTEGER(KIND=JPIM) :: IIFC, IDGLU + INTEGER(KIND=JPIM) :: IFIRST, J3 - REAL(KIND=JPRB), ALLOCATABLE :: PIA(:,:,:) + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: PIA(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE !CHARACTER(LEN=10) :: CLHOOK @@ -161,119 +162,102 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IFIRST = IFIRST + 2*KF_SCALARS ! Scalars IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + ! Allocate data accordingly ALLOCATE(PIA(IFIRST,R%NSMAX+3,D%NUMP)) !$ACC ENTER DATA CREATE(PIA) + ! And reiterate domain decomposition to assign pointers + IFIRST = 0 + IF (.NOT. LVORGP .OR. LDIVGP) THEN + ! Usually we want to store vorticity first + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence + ELSE + ! Except if we want to translate Vorticity but not Divergence, we should have Divergence first + ! Then we have all buffers that move on in a contiguous buffer + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence + + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + ENDIF + PU => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! U + PV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! V + PSCALARS => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars + PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives - IFIRST = 1 - ILAST = 0 - CALL GSTATS(431,1) IF (KF_UV > 0) THEN - IVORL = 1 - IVORU = 2*KF_UV - IDIVL = 2*KF_UV+1 - IDIVU = 4*KF_UV - IUL = 4*KF_UV+1 - IUU = 6*KF_UV - IVL = 6*KF_UV+1 - IVU = 8*KF_UV - - IDIM2=UBOUND(PSPVOR,2) - CALL PRFI1B(PIA(IVORL:IVORU,:,:),PSPVOR,KF_UV,IDIM2,KFLDPTRUV) - CALL PRFI1B(PIA(IDIVL:IDIVU,:,:),PSPDIV,KF_UV,IDIM2,KFLDPTRUV) - - ! ------------------------------------------------------------------ - - CALL VDTUV(KF_UV,ZEPSNM,PIA(IVORL:IVORU,:,:),PIA(IDIVL:IDIVU,:,:),& - & PIA(IUL:IUU,:,:),PIA(IVL:IVU,:,:)) - ILAST = ILAST+8*KF_UV - + CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2),KFLDPTRUV) + CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2),KFLDPTRUV) + + ! Compute U and V for VOR and DIV + CALL VDTUV(KF_UV,ZEPSNM,PVOR,PDIV,PU,PV) ENDIF - - IF(KF_SCALARS > 0)THEN + + IF (KF_SCALARS > 0) THEN IF(PRESENT(PSPSCALAR)) THEN - IFIRST = ILAST+1 - ILAST = IFIRST - 1 + 2*KF_SCALARS - - IDIM2=UBOUND(PSPSCALAR,2) - CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSCALAR(:,:),KF_SCALARS,IDIM2,KFLDPTRSC) + CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2),KFLDPTRSC) ELSE + IFIRST = 1 IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*NF_SC2 - IDIM2=UBOUND(PSPSC2,2) - CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC2(:,:),NF_SC2,IDIM2) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC2-1,:,:),PSPSC2(:,:),NF_SC2,UBOUND(PSPSC2,2)) + IFIRST = IFIRST+2*NF_SC2 ENDIF IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN - IDIM1=NF_SC3A - IDIM3=UBOUND(PSPSC3A,3) - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*IDIM1 - IDIM2=UBOUND(PSPSC3A,2) - DO J3=1,IDIM3 - CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC3A(:,:,J3),IDIM1,IDIM2) + DO J3=1,UBOUND(PSPSC3A,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3A-1,:,:),PSPSC3A(:,:,J3),NF_SC3A,UBOUND(PSPSC3A,2)) + IFIRST = IFIRST+2*NF_SC3A ENDDO ENDIF IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN - IDIM1=NF_SC3B - IDIM3=UBOUND(PSPSC3B,3) - IDIM2=UBOUND(PSPSC3B,2) - DO J3=1,IDIM3 - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*IDIM1 - - CALL PRFI1B(PIA(IFIRST:ILAST,:,:),PSPSC3B(:,:,J3),IDIM1,IDIM2) + DO J3=1,UBOUND(PSPSC3B,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3B-1,:,:),PSPSC3B(:,:,J3),NF_SC3B,UBOUND(PSPSC3B,2)) + IFIRST = IFIRST+2*NF_SC3B ENDDO ENDIF + IF(IFIRST-1 /= 2*KF_SCALARS) THEN + WRITE(0,*) 'LTINV:KF_SCALARS,IFIRST',KF_SCALARS,IFIRST + CALL ABORT_TRANS('LTINV_MOD:IFIRST /= 2*KF_SCALARS') + ENDIF ENDIF ENDIF - IF(ILAST /= 8*KF_UV+2*KF_SCALARS) THEN - WRITE(0,*) 'LTINV:KF_UV,KF_SCALARS,ILAST ',KF_UV,KF_SCALARS,ILAST - CALL ABORT_TRANS('LTINV_MOD:ILAST /= 8*KF_UV+2*KF_SCALARS') - ENDIF !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA - + IF (KF_SCDERS > 0) THEN - ! stop 'Error: code path not (yet) supported in GPU version' - ISL = 2*(4*KF_UV)+1 - ISU = ISL+2*KF_SCALARS-1 - IDL = 2*(4*KF_UV+KF_SCALARS)+1 - IDU = IDL+2*KF_SCDERS-1 - CALL SPNSDE(KF_SCALARS,ZEPSNM,PIA(ISL:ISU,:,:),PIA(IDL:IDU,:,:)) - ENDIF - + CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) + ENDIF + ! ------------------------------------------------------------------ - - + + !* 4. INVERSE LEGENDRE TRANSFORM. ! --------------------------- - ! FROM ZIA TO ZAOA1 and ZSOA1 - - ISTA = 1 - IF(KF_UV > 0 .AND. .NOT. LVORGP) THEN - ISTA = ISTA+2*KF_UV - ENDIF - IF(KF_UV > 0 .AND. .NOT. LDIVGP) THEN - ISTA = ISTA+2*KF_UV - ENDIF - - IF( KF_OUT_LT > 0 ) THEN - CALL LEINV(KF_OUT_LT,PIA(ISTA:ISTA+2*KF_OUT_LT-1,:,:)) + ! Forget aobut Vorticity and divergence if we don't need it in the output + IFIRST = 1 + IF(.NOT. LVORGP) IFIRST = IFIRST+2*KF_UV + IF(.NOT. LDIVGP) IFIRST = IFIRST+2*KF_UV - IF(PRESENT(FSPGL_PROC)) THEN - stop 'Error: SPGL_PROC is not (yet) optimized in GPU version. Need to figure out how to implement' - ENDIF - + ! Output is being stored in FOUBUF_IN + IF (KF_OUT_LT > 0) THEN + CALL LEINV(KF_OUT_LT,PIA(IFIRST:,:,:)) ENDIF !$ACC EXIT DATA DELETE(PIA) DEALLOCATE(PIA) + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From a65fbf6356a82733b04805486a2bd49af2173267 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:05 -0700 Subject: [PATCH 087/263] LEINV can now infer # fields; and we can pas FOUBUF_IN --- src/trans/gpu/internal/leinv_mod.F90 | 54 ++++++++++++++-------------- src/trans/gpu/internal/ltinv_mod.F90 | 6 ++-- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index d123375cb..0c59b15dc 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -10,7 +10,7 @@ MODULE LEINV_MOD CONTAINS -SUBROUTINE LEINV(KF_LT,PIA) +SUBROUTINE LEINV(PIA,FOUBUF_IN) !**** *LEINV* - Inverse Legendre transform. @@ -58,7 +58,6 @@ SUBROUTINE LEINV(KF_LT,PIA) USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_GEN, ONLY: NOUT -USE TPM_TRANS ,ONLY : FOUBUF_IN USE CUDA_GEMM_BATCHED_MOD USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -71,20 +70,23 @@ SUBROUTINE LEINV(KF_LT,PIA) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: KIFC -INTEGER(KIND=JPIM), INTENT(IN) :: KF_LT REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) +REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) ! LOCAL REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUTS(:), ZOUTA(:) REAL(KIND=JPRBT) :: ZAOA, ZSOA INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 +INTEGER(KIND=JPIM) :: KFIELDS REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - + !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) + +KFIELDS = SIZE(PIA,1) ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -96,9 +98,9 @@ SUBROUTINE LEINV(KF_LT,PIA) ENDIF CALL GSTATS(453,0) -ALLOCATE(ZINP(2*KF_LT*TDZAS*D_NUMP)) -ALLOCATE(ZOUTS(2*KF_LT*R_NDGNH*D_NUMP)) -ALLOCATE(ZOUTA(2*KF_LT*R_NDGNH*D_NUMP)) +ALLOCATE(ZINP(KFIELDS*TDZAS*D_NUMP)) +ALLOCATE(ZOUTS(KFIELDS*R_NDGNH*D_NUMP)) +ALLOCATE(ZOUTA(KFIELDS*R_NDGNH*D_NUMP)) !$ACC DATA COPYIN(D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & !$ACC& CREATE (ZINP,ZOUTS,ZOUTA) & @@ -118,18 +120,18 @@ SUBROUTINE LEINV(KF_LT,PIA) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,2*KF_LT + DO JK=1,KFIELDS KM = D_MYMS(KMLOC) IA = 1+MOD(R_NSMAX-KM+2,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_LT)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*KFIELDS)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*2*KF_LT)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*KFIELDS)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -145,12 +147,12 @@ SUBROUTINE LEINV(KF_LT,PIA) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & 2*KF_LT, R_NDGNH, TDZAA, & + & KFIELDS, R_NDGNH, TDZAA, & & 1.0_JPRBT, & - & ZINP, 2*KF_LT, TDZAA,& + & ZINP, KFIELDS, TDZAA,& & ZAA, R_NDGNH, TDZAA, & & 0._JPRBT, & - & ZOUTA, 2*KF_LT, R_NDGNH, & + & ZOUTA, KFIELDS, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA @@ -166,18 +168,18 @@ SUBROUTINE LEINV(KF_LT,PIA) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JK=1,2*KF_LT + DO JK=1,KFIELDS KM = D_MYMS(KMLOC) IS = 1+MOD(R_NSMAX-KM+1,2) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_LT)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*KFIELDS)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*2*KF_LT)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*KFIELDS)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -189,32 +191,32 @@ SUBROUTINE LEINV(KF_LT,PIA) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & - & 2*KF_LT, R_NDGNH, TDZAS, & + & KFIELDS, R_NDGNH, TDZAS, & & 1.0_JPRBT, & - & ZINP, 2*KF_LT, TDZAS, & + & ZINP, KFIELDS, TDZAS, & & ZAS, R_NDGNH, TDZAS, & & 0._JPRBT, & - & ZOUTS, 2*KF_LT, R_NDGNH, & + & ZOUTS, KFIELDS, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP - DO JK=1,2*KF_LT + DO JK=1,KFIELDS KM = D_MYMS(KMLOC) ISL = R_NDGNH-G_NDGLU(KM)+1 !$ACC LOOP SEQ DO JGL=ISL,R_NDGNH IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*2*KF_LT - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*2*KF_LT + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*KFIELDS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*KFIELDS IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) - ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_LT) + ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) ELSE ! Imaginary values of KM=0 is zero, though I don't think we care ZSOA = 0_JPRBT diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index aafbf42d2..e88cf0119 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -19,7 +19,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, foubuf_in, LSCDERS + USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, FOUBUF_IN, LSCDERS USE TPM_FLT USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D @@ -250,9 +250,9 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IF(.NOT. LVORGP) IFIRST = IFIRST+2*KF_UV IF(.NOT. LDIVGP) IFIRST = IFIRST+2*KF_UV - ! Output is being stored in FOUBUF_IN + ! Transform PIA into FOUBUF_IN IF (KF_OUT_LT > 0) THEN - CALL LEINV(KF_OUT_LT,PIA(IFIRST:,:,:)) + CALL LEINV(PIA(IFIRST:,:,:), FOUBUF_IN) ENDIF !$ACC EXIT DATA DELETE(PIA) From 97bbf8b466f73116a4712d5fd9468189a6236828 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:06 -0700 Subject: [PATCH 088/263] Simple interface improvements --- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 27 ++++------- src/trans/gpu/internal/ltinv_mod.F90 | 59 +++++++++--------------- src/trans/gpu/internal/prfi1b_mod.F90 | 1 - src/trans/gpu/internal/trmtol_mod.F90 | 24 +++++----- 4 files changed, 42 insertions(+), 69 deletions(-) diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index e96a823fb..bef29d859 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -81,36 +81,27 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& EXTERNAL FSPGL_PROC OPTIONAL FSPGL_PROC - INTEGER(KIND=JPIM) :: JM,IM,IBLEN,ILEI2,IDIM1, i, j + INTEGER(KIND=JPIM) :: JM,IM,i, j, KFIELD !$ACC DATA CREATE(FOUBUF_IN) PRESENT(FOUBUF) CALL GSTATS(102,0) - ILEI2 = 8*KF_UV + 2*KF_SCALARS + 2*KF_SCDERS - IDIM1 = 2*KF_OUT_LT - IBLEN = D%NLENGT0B*2*KF_OUT_LT - - IF(KF_OUT_LT > 0) THEN - CALL GSTATS(1647,0) - - ! from PSPXXX to FOUBUF_IN - CALL LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,ILEI2,IDIM1,& - & PSPVOR,PSPDIV,PSPSCALAR ,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - - CALL GSTATS(1647,1) - ENDIF + CALL GSTATS(1647,0) + ! from PSPXXX to FOUBUF_IN + CALL LTINV(KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & KFLDPTRUV,KFLDPTRSC,FOUBUF_IN,KFIELD) + CALL GSTATS(1647,1) CALL GSTATS(102,1) CALL GSTATS(152,0) ! from FOUBUF_IN to FOUBUF #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) + CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,KFIELD) #else - CALL TRMTOL(FOUBUF_IN,FOUBUF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) + CALL TRMTOL(FOUBUF_IN,FOUBUF,KFIELD) #endif CALL GSTATS(152,1) !$ACC END DATA diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index e88cf0119..a976f9b36 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -10,22 +10,18 @@ MODULE LTINV_MOD CONTAINS - SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - + SUBROUTINE LTINV(KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & KFLDPTRUV,KFLDPTRSC,FOUBUF_IN,FOUBUF_KFIELD) + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - + USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, FOUBUF_IN, LSCDERS + USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, LSCDERS USE TPM_FLT USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D - use tpm_gen, only: nout - !USE PRLE1_MOD - USE PREPSNM_MOD ,ONLY : PREPSNM USE PRFI1B_MOD ,ONLY : PRFI1B USE VDTUV_MOD ,ONLY : VDTUV USE SPNSDE_MOD ,ONLY : SPNSDE @@ -89,12 +85,8 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& IMPLICIT NONE - INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS - INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS - INTEGER(KIND=JPIM), INTENT(IN) :: KLEI2 - INTEGER(KIND=JPIM), INTENT(IN) :: KDIM1 REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPDIV(:,:) @@ -104,12 +96,9 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - - - EXTERNAL FSPGL_PROC - OPTIONAL FSPGL_PROC - - INTEGER(KIND=JPIM) :: IIFC, IDGLU + REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) + INTEGER(KIND=JPIM), INTENT(OUT) :: FOUBUF_KFIELD + INTEGER(KIND=JPIM) :: IFIRST, J3 REAL(KIND=JPRB), ALLOCATABLE, TARGET :: PIA(:,:,:) @@ -117,28 +106,19 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - !CHARACTER(LEN=10) :: CLHOOK - - - INTEGER(KIND=JPIM) :: KM - INTEGER(KIND=JPIM) :: KMLOC - - + ! ------------------------------------------------------------------ - + !* 1. PERFORM LEGENDRE TRANFORM. ! -------------------------- - - !WRITE(CLHOOK,FMT='(A,I4.4)') 'LTINV_',KM + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',0,ZHOOK_HANDLE) - + ! ------------------------------------------------------------------ - - + + !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. ! ---------------------------------------------- - -! COPY FROM PSPXXXX TO ZIA IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') @@ -235,7 +215,7 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& !$ACC END DATA !$ACC END DATA - IF (KF_SCDERS > 0) THEN + IF (LSCDERS) THEN CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) ENDIF @@ -245,13 +225,16 @@ SUBROUTINE LTINV(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& !* 4. INVERSE LEGENDRE TRANSFORM. ! --------------------------- - ! Forget aobut Vorticity and divergence if we don't need it in the output + ! Forget about Vorticity and divergence if we don't need it in the output IFIRST = 1 IF(.NOT. LVORGP) IFIRST = IFIRST+2*KF_UV IF(.NOT. LDIVGP) IFIRST = IFIRST+2*KF_UV + ! Keep this for next functions because we have to remember this + FOUBUF_KFIELD = SIZE(PIA,1)-IFIRST+1 + ! Transform PIA into FOUBUF_IN - IF (KF_OUT_LT > 0) THEN + IF (FOUBUF_KFIELD > 0) THEN CALL LEINV(PIA(IFIRST:,:,:), FOUBUF_IN) ENDIF diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index a2b410b71..06ef39d52 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -141,7 +141,6 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) DO JN=2,R_NSMAX+2-KM INM = IASM0+((R_NSMAX+2-JN)-KM)*2 IF( INM .LT. KDIM ) THEN ! TODO is this really needed, we don't have it in the reverse... - ! TODO THIS IS NOT JN+1 in the reverse code but JN PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) END IF diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 0a7f6782d..069b392da 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -91,10 +91,10 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IF(NPROC > 1) THEN DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*2*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*2*KFIELD - ILENR(J) = D%NLTSGTB(J)*2*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*2*KFIELD + ILENS(J) = D%NLTSFTB(J)*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*KFIELD + ILENR(J) = D%NLTSGTB(J)*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*KFIELD ENDDO CALL GSTATS(807,0) @@ -138,8 +138,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) CALL GSTATS(410,1) CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*2*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*KFIELD + ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 CALL GSTATS(1608,0) !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 @@ -227,10 +227,10 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) IF(NPROC > 1) THEN DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*2*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*2*KFIELD - ILENR(J) = D%NLTSGTB(J)*2*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*2*KFIELD + ILENS(J) = D%NLTSFTB(J)*KFIELD + IOFFS(J) = D%NSTAGT1B(J)*KFIELD + ILENR(J) = D%NLTSGTB(J)*KFIELD + IOFFR(J) = D%NSTAGT0B(J)*KFIELD ENDDO CALL GSTATS(807,0) @@ -244,8 +244,8 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*2*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*KFIELD + ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 CALL GSTATS(1608,0) !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 From f0b5e5cc288204342e6d1360a8cf639fc446d4b5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:06 -0700 Subject: [PATCH 089/263] Improve allocation of FOURIER_IN --- src/trans/gpu/internal/leinv_mod.F90 | 9 +++++++-- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 10 ++++------ src/trans/gpu/internal/ltinv_mod.F90 | 2 +- src/trans/gpu/internal/trmtol_mod.F90 | 12 ++++++++++-- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 0c59b15dc..5e401ef0c 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -71,7 +71,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: KIFC REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) -REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) +REAL(KIND=JPRB), INTENT(OUT), ALLOCATABLE :: FOUBUF_IN(:) ! LOCAL REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUTS(:), ZOUTA(:) @@ -104,7 +104,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC DATA COPYIN(D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & !$ACC& CREATE (ZINP,ZOUTS,ZOUTA) & -!$ACC& PRESENT(ZAA,ZAS,PIA,FOUBUF_IN) & +!$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) ! READ 2:NSMAX+3 @@ -200,6 +200,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & D_NUMP) !$ACC END HOST_DATA +ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) +!$ACC ENTER DATA CREATE(FOUBUF_IN) + +!$ACC DATA PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP DO JK=1,KFIELDS @@ -228,6 +232,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO ENDDO +!$ACC END DATA !$ACC END DATA DEALLOCATE(ZINP) diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index bef29d859..874d90210 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -57,7 +57,7 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN + USE TPM_TRANS ,ONLY : FOUBUF USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_NSDERS @@ -82,13 +82,11 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& OPTIONAL FSPGL_PROC INTEGER(KIND=JPIM) :: JM,IM,i, j, KFIELD - - - !$ACC DATA CREATE(FOUBUF_IN) PRESENT(FOUBUF) + REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) CALL GSTATS(102,0) CALL GSTATS(1647,0) - ! from PSPXXX to FOUBUF_IN + ! LTINV allocates FOUBUF_IN and creates on device CALL LTINV(KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC,FOUBUF_IN,KFIELD) @@ -96,6 +94,7 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& CALL GSTATS(102,1) CALL GSTATS(152,0) + ! TRMTOL deallocates FOUBUF_IN and deletes on device ! from FOUBUF_IN to FOUBUF #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' @@ -104,7 +103,6 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& CALL TRMTOL(FOUBUF_IN,FOUBUF,KFIELD) #endif CALL GSTATS(152,1) - !$ACC END DATA ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index a976f9b36..34ad7bca9 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -96,7 +96,7 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) + REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM), INTENT(OUT) :: FOUBUF_KFIELD INTEGER(KIND=JPIM) :: IFIRST, J3 diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 069b392da..fa179e95b 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -74,7 +74,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK @@ -148,6 +148,11 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) CALL GSTATS(1608,1) ENDIF +IF (ALLOCATED(PFBUF_IN)) THEN + !$ACC EXIT DATA DELETE(PFBUF_IN) + DEALLOCATE(PFBUF_IN) +ENDIF + IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ @@ -216,7 +221,7 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA @@ -254,6 +259,9 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) CALL GSTATS(1608,1) ENDIF +!$ACC EXIT DATA DELETE(PFBUF_IN) +DEALLOCATE(PFBUF_IN) + IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ END SUBROUTINE TRMTOL From e688ab310bc15305d2c65b7fffc422a895d4dced Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:06 -0700 Subject: [PATCH 090/263] Non critical simple clean up work --- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 25 +++++++------------- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 16 ++++--------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 6190db0c8..d1a7f62d5 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -259,9 +259,9 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IPTRSPSC(JFLD) = ISTSC+JFLD-1 ENDDO - CALL LTINV_CTL(IF_OUT_LT,IF_UV,IF_SCALARS,IF_SCDERS, & + CALL LTINV_CTL(IF_UV,IF_SCALARS, & & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC,FSPGL_PROC=FSPGL_PROC) + & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& @@ -287,35 +287,28 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! Figure out where we want to store data in ZGTF IST = 1 ZGTF_START(ZGTF_START_INDEX_VOR) = IST - IF (LVORGP) THEN - IST = IST+KF_UV - ENDIF + IF (LVORGP) IST = IST+KF_UV ZGTF_START(ZGTF_START_INDEX_DIV) = IST - IF (LDIVGP) THEN - IST = IST+KF_UV - ENDIF + IF (LDIVGP) IST = IST+KF_UV ZGTF_START(ZGTF_START_INDEX_UV) = IST IST = IST+2*KF_UV ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST IST = IST+KF_SCALARS ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST - IST = IST+KF_SCDERS + IF (LSCDERS) IST = IST+KF_SCDERS ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST - IF (LUVDER) THEN - IST = IST+2*KF_UV - ENDIF + IF (LUVDER) IST = IST+2*KF_UV ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST - IST = IST+KF_SCDERS + IF (LSCDERS) IST = IST+KF_SCDERS ZGTF_START(ZGTF_START_INDEX_END) = IST !$ACC DATA CREATE(FOUBUF) ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF call nvtxStartRange("LTINV") - CALL LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS, & + CALL LTINV_CTL(KF_UV,KF_SCALARS, & &PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - &PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2,& - &FSPGL_PROC=FSPGL_PROC) + &PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) call nvtxEndRange ! from FOUBUF to PGPXXX diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 874d90210..5a2289664 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -10,10 +10,9 @@ MODULE LTINV_CTL_MOD CONTAINS - SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2,& - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) + SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& + & KFLDPTRUV,KFLDPTRSC) !**** *LTINV_CTL* - Control routine for inverse Legandre transform. @@ -24,17 +23,13 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& !** Interface. ! ---------- ! CALL INV_TRANS_CTL(...) - ! KF_OUT_LT - number of fields coming out from inverse LT ! KF_UV - local number of spectral u-v fields ! KF_SCALARS - local number of scalar spectral fields - ! KF_SCDERS - local number of derivatives of scalar spectral fields ! PSPVOR(:,:) - spectral vorticity (input) ! PSPDIV(:,:) - spectral divergence (input) ! PSPSCALAR(:,:) - spectral scalarvalued fields (input) ! KFLDPTRUV(:) - field pointer array for vor./div. ! KFLDPTRSC(:) - field pointer array for PSPSCALAR - ! FSPGL_PROC - external procedure to be executed in fourier space - ! before transposition ! Method. ! ------- @@ -60,7 +55,6 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& USE TPM_TRANS ,ONLY : FOUBUF USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G - USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_NSDERS USE TPM_FLT @@ -69,7 +63,7 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& IMPLICIT NONE - INTEGER(KIND=JPIM),INTENT(IN) :: KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS + INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV,KF_SCALARS REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) @@ -78,8 +72,6 @@ SUBROUTINE LTINV_CTL(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - EXTERNAL FSPGL_PROC - OPTIONAL FSPGL_PROC INTEGER(KIND=JPIM) :: JM,IM,i, j, KFIELD REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) From d0617446bebb91eff623ed7770b4431441780863 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:06 -0700 Subject: [PATCH 091/263] Add FOUBUF to LTINV_CTL arguments --- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 5 ++--- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index d1a7f62d5..c20b3af5d 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -261,7 +261,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL LTINV_CTL(IF_UV,IF_SCALARS, & & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) + & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC,FOUBUF=FOUBUF) IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& @@ -307,8 +307,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! from PSPXXX to FOUBUF call nvtxStartRange("LTINV") CALL LTINV_CTL(KF_UV,KF_SCALARS, & - &PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - &PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF) call nvtxEndRange ! from FOUBUF to PGPXXX diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 5a2289664..177225f37 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -11,7 +11,7 @@ MODULE LTINV_CTL_MOD CONTAINS SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,& & KFLDPTRUV,KFLDPTRSC) !**** *LTINV_CTL* - Control routine for inverse Legandre transform. @@ -52,7 +52,6 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : FOUBUF USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G @@ -70,6 +69,7 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) + REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) From 5154a76b33d5c2629c1087d29d1bd8bb08c57b7c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:07 -0700 Subject: [PATCH 092/263] Make FOUBUF allocatable and pass through --- src/trans/gpu/internal/fourier_in_mod.F90 | 9 ++++++--- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 7 ++++--- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 14 +++++++------- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 2 +- src/trans/gpu/internal/trmtol_mod.F90 | 10 ++++++++-- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index c47e4e6ac..6b0912394 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_IN_MOD CONTAINS -SUBROUTINE FOURIER_IN(PREEL,KFIELDS) +SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) !**** *FOURIER_IN* - Copy fourier data from buffer to local array @@ -38,10 +38,9 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRB, JPRBT USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS -USE TPM_TRANS ,ONLY : FOUBUF USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NMEN_MAX use tpm_gen, only: nout ! @@ -53,6 +52,7 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) +REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA,OFFSET_VAR,IOFF INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,iimax1,iimax2,iimax3,iunit @@ -99,6 +99,9 @@ SUBROUTINE FOURIER_IN(PREEL,KFIELDS) !$ACC WAIT(1) +!$ACC EXIT DATA DELETE(FOUBUF) +DEALLOCATE(FOUBUF) + END SUBROUTINE FOURIER_IN END MODULE FOURIER_IN_MOD diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index e68c1853c..f94bed6f2 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -11,7 +11,7 @@ MODULE FTINV_CTL_MOD CONTAINS SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,KVSETUV,KVSETSC,KPTRGP, & + & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,FOUBUF,KVSETUV,KVSETSC,KPTRGP, & & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -65,7 +65,7 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& USE TPM_GEN ,ONLY : NERR, nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : FOUBUF, LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, ZGTF +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, ZGTF USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TPM_FLT ,ONLY : S USE FOURIER_IN_MOD ,ONLY : FOURIER_IN @@ -98,6 +98,7 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) +REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) INTEGER(KIND=JPIM) :: IST INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -116,7 +117,7 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(1639,0) ! from FOUBUF to ZGTF -CALL FOURIER_IN(ZGTF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) +CALL FOURIER_IN(FOUBUF,ZGTF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) ! 2. Fourier space computations diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index c20b3af5d..a0740d70e 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -87,7 +87,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE PARKIND1 ,ONLY : JPIM ,JPRB USE TPM_GEN ,ONLY : NPROMATR -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, FOUBUF +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END @@ -141,6 +141,8 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_SCDERS,IF_OUT_LT INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB,IST +REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF(:) + ! ------------------------------------------------------------------ ! Perform transform @@ -266,18 +268,18 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& + & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_UV_G > 0) THEN CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),KPTRGP=IPTRGP,& + & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_SCALARS_G > 0) THEN CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& + & FOUBUF,KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) ENDIF ENDDO @@ -302,7 +304,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF (LSCDERS) IST = IST+KF_SCDERS ZGTF_START(ZGTF_START_INDEX_END) = IST - !$ACC DATA CREATE(FOUBUF) ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF call nvtxStartRange("LTINV") @@ -314,10 +315,9 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& call nvtxStartRange("FTINV") CALL FTINV_CTL(KF_UV_G,KF_SCALARS_G,& & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,& - & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - !$ACC END DATA call nvtxEndRange call nvtxEndRange diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 177225f37..7e90bd088 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -69,7 +69,7 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) - REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF(:) + REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index fa179e95b..3ae3f8902 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -73,7 +73,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT), INTENT(OUT), ALLOCATABLE :: PFBUF(:) REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) @@ -89,6 +89,9 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) +ALLOCATE(PFBUF(D%NLENGT0B*KFIELD)) +!$ACC ENTER DATA CREATE(PFBUF) + IF(NPROC > 1) THEN DO J=1,NPRTRW ILENS(J) = D%NLTSFTB(J)*KFIELD @@ -220,7 +223,7 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT), INTENT(OUT), ALLOCATABLE :: PFBUF(:) REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) @@ -230,6 +233,9 @@ SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) +ALLOCATE(PFBUF(D%NLENGT0B*KFIELD)) +!$ACC ENTER DATA CREATE(PFBUF) + IF(NPROC > 1) THEN DO J=1,NPRTRW ILENS(J) = D%NLTSFTB(J)*KFIELD From b8a09c6a485f45b63361d922e2391adb3f0901e8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:07 -0700 Subject: [PATCH 093/263] Pass KFIELD through to Fourier Transform --- src/trans/gpu/internal/fourier_in_mod.F90 | 4 +-- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 28 +++++++++++++++-- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 33 +++++--------------- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 11 +++---- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 6b0912394..122f816b1 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -81,14 +81,14 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS + DO JF=1,KFIELDS/2 IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KFIELDS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS PREEL(2*JF-1,2*JM+IOFF) = FOUBUF(ISTA+2*JF-1) PREEL(2*JF, 2*JM+IOFF) = FOUBUF(ISTA+2*JF ) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index f94bed6f2..d12faaa24 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE FTINV_CTL_MOD CONTAINS -SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& +SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,FOUBUF,KVSETUV,KVSETSC,KPTRGP, & & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -73,12 +73,15 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& USE FTINV_MOD ,ONLY : FTINV USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_NSDERS, ZGTF_START_INDEX_UVDERS +USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & + & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & + & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END use ieee_arithmetic ! IMPLICIT NONE +INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV_G INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS_G INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV @@ -116,8 +119,27 @@ SUBROUTINE FTINV_CTL(KF_UV_G,KF_SCALARS_G,& CALL GSTATS(107,0) CALL GSTATS(1639,0) + + ! Figure out where we want to store data in ZGTF + IST = 1 + ZGTF_START(ZGTF_START_INDEX_VOR) = IST + IF (LVORGP) IST = IST+KF_UV + ZGTF_START(ZGTF_START_INDEX_DIV) = IST + IF (LDIVGP) IST = IST+KF_UV + ZGTF_START(ZGTF_START_INDEX_UV) = IST + IST = IST+2*KF_UV + ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST + IST = IST+KF_SCALARS + ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST + IF (LSCDERS) IST = IST+KF_SCDERS + ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST + IF (LUVDER) IST = IST+2*KF_UV + ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST + IF (LSCDERS) IST = IST+KF_SCDERS + ZGTF_START(ZGTF_START_INDEX_END) = IST + ! from FOUBUF to ZGTF -CALL FOURIER_IN(FOUBUF,ZGTF,ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1) +CALL FOURIER_IN(FOUBUF,ZGTF,KFIELD) ! 2. Fourier space computations diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index a0740d70e..285c5f3ec 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -88,9 +88,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TPM_GEN ,ONLY : NPROMATR USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP -USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & - & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & - & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END !USE TPM_DISTR USE SHUFFLE_MOD ,ONLY : SHUFFLE @@ -140,6 +137,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_SCDERS,IF_OUT_LT INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB,IST +INTEGER(KIND=JPIM) :: KFIELD REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF(:) @@ -263,21 +261,21 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL LTINV_CTL(IF_UV,IF_SCALARS, & & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC,FOUBUF=FOUBUF) + & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC,FOUBUF=FOUBUF,KFIELD=KFIELD) IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& + CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_UV_G > 0) THEN - CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& + CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTINV_CTL(IF_UV_G,IF_SCALARS_G,& + CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& & FOUBUF,KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) @@ -286,34 +284,17 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ELSE call nvtxStartRange("INVTRANS") - ! Figure out where we want to store data in ZGTF - IST = 1 - ZGTF_START(ZGTF_START_INDEX_VOR) = IST - IF (LVORGP) IST = IST+KF_UV - ZGTF_START(ZGTF_START_INDEX_DIV) = IST - IF (LDIVGP) IST = IST+KF_UV - ZGTF_START(ZGTF_START_INDEX_UV) = IST - IST = IST+2*KF_UV - ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST - IST = IST+KF_SCALARS - ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST - IF (LSCDERS) IST = IST+KF_SCDERS - ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST - IF (LUVDER) IST = IST+2*KF_UV - ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST - IF (LSCDERS) IST = IST+KF_SCDERS - ZGTF_START(ZGTF_START_INDEX_END) = IST ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF call nvtxStartRange("LTINV") CALL LTINV_CTL(KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF) + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,KFIELD) call nvtxEndRange ! from FOUBUF to PGPXXX call nvtxStartRange("FTINV") - CALL FTINV_CTL(KF_UV_G,KF_SCALARS_G,& + CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,& & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index 7e90bd088..aca1742a8 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -11,7 +11,7 @@ MODULE LTINV_CTL_MOD CONTAINS SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,KFIELD,& & KFLDPTRUV,KFLDPTRSC) !**** *LTINV_CTL* - Control routine for inverse Legandre transform. @@ -59,9 +59,9 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& USE LTINV_MOD ,ONLY : LTINV USE TRMTOL_MOD ,ONLY : TRMTOL, TRMTOL_CUDAAWARE - + IMPLICIT NONE - + INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV,KF_SCALARS REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) @@ -72,10 +72,9 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - - INTEGER(KIND=JPIM) :: JM,IM,i, j, KFIELD + INTEGER(KIND=JPIM),INTENT(OUT) :: KFIELD REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) - + CALL GSTATS(102,0) CALL GSTATS(1647,0) ! LTINV allocates FOUBUF_IN and creates on device From 79dc709eddf00769d5585fca40b916aee9b4db97 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:07 -0700 Subject: [PATCH 094/263] ZGTF is now reallocated and properly sized in inv_trans --- src/trans/gpu/internal/fsc_mod.F90 | 65 +++++++++++----------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 70 +++++++++++++++--------- src/trans/gpu/internal/ftinv_mod.F90 | 6 +- src/trans/gpu/internal/ltinv_mod.F90 | 6 +- src/trans/gpu/internal/tpm_fields.F90 | 11 ---- 5 files changed, 84 insertions(+), 74 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 4ab621d8c..8fd50706e 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -10,7 +10,7 @@ MODULE FSC_MOD CONTAINS -SUBROUTINE FSC +SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) !**** *FSC - Division by a*cos(theta), east-west derivatives @@ -47,11 +47,10 @@ SUBROUTINE FSC USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE TPM_TRANS ,ONLY : LUVDER, LATLON, ZGTF, LVORGP, LDIVGP +USE TPM_TRANS ,ONLY : LUVDER, LATLON, LVORGP, LDIVGP USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF -USE TPM_FIELDS ,ONLY : F, ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & - & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR USE TPM_GEOMETRY ,ONLY : G, G_NMEN +USE TPM_FIELDS ,ONLY : F USE TPM_FLT ,ONLY: S use tpm_gen, only: nout ! @@ -62,9 +61,13 @@ SUBROUTINE FSC REAL(KIND=JPRBT) :: ZACHTE2 INTEGER(KIND=JPIM) :: IOFF,OFFSET_VAR -INTEGER(KIND=JPIM) :: JF,IGLG,JM,JF_UV,JF_SCALAR +INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV, KF_SCALARS +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: & + & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) + ! ------------------------------------------------------------------ IF(MYPROC > NPROC/2)THEN @@ -77,7 +80,7 @@ SUBROUTINE FSC IINC=-1 ENDIF -!$ACC DATA PRESENT(ZGTF,D,G,F,D_NSTAGTF,G_NMEN) COPYIN(ZGTF_START) +!$ACC DATA PRESENT(D,G,F,D_NSTAGTF,G_NMEN,PUV,PSCALARS,PSCALARS_NSDER,PUV_EWDER,PSCALARS_EWDER) IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" @@ -91,30 +94,28 @@ SUBROUTINE FSC !* 1.1 U AND V. -IF (ZGTF_START(ZGTF_START_INDEX_UV) /= ZGTF_START(ZGTF_START_INDEX_UV+1)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) - DO KGL=IBEG,IEND,IINC - DO JF=ZGTF_START(ZGTF_START_INDEX_UV),ZGTF_START(ZGTF_START_INDEX_UV+1)-1 - IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 +!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) +DO KGL=IBEG,IEND,IINC + DO JF=1,2*KF_UV + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 - ZACHTE2 = F%RACTHE(IGLG) + ZACHTE2 = F%RACTHE(IGLG) - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - ZGTF(2*JF-1,2*JM+IOFF) = ZGTF(2*JF-1,2*JM+IOFF)*ZACHTE2 - ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF ,2*JM+IOFF)*ZACHTE2 - ENDDO + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + PUV(2*JF-1,2*JM+IOFF) = PUV(2*JF-1,2*JM+IOFF)*ZACHTE2 + PUV(2*JF, 2*JM+IOFF) = PUV(2*JF ,2*JM+IOFF)*ZACHTE2 ENDDO ENDDO -ENDIF +ENDDO !* 1.2 N-S DERIVATIVES -IF (ZGTF_START(ZGTF_START_INDEX_NSDERS) /= ZGTF_START(ZGTF_START_INDEX_NSDERS+1)) THEN +IF (ASSOCIATED(PSCALARS_NSDER)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=ZGTF_START(ZGTF_START_INDEX_NSDERS),ZGTF_START(ZGTF_START_INDEX_NSDERS+1)-1 + DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 @@ -122,8 +123,8 @@ SUBROUTINE FSC !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - ZGTF(2*JF-1,2*JM+IOFF) = ZGTF(2*JF-1,2*JM+IOFF)*ZACHTE2 - ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF, 2*JM+IOFF)*ZACHTE2 + PSCALARS_NSDER(2*JF-1,2*JM+IOFF) = PSCALARS_NSDER(2*JF-1,2*JM+IOFF)*ZACHTE2 + PSCALARS_NSDER(2*JF, 2*JM+IOFF) = PSCALARS_NSDER(2*JF, 2*JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -136,40 +137,38 @@ SUBROUTINE FSC !* 2.1 U AND V. -IF (ZGTF_START(ZGTF_START_INDEX_UVDERS) /= ZGTF_START(ZGTF_START_INDEX_UVDERS+1)) THEN +IF (ASSOCIATED(PUV_EWDER)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=ZGTF_START(ZGTF_START_INDEX_UVDERS),ZGTF_START(ZGTF_START_INDEX_UVDERS+1)-1 + DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 ZACHTE2 = F%RACTHE(IGLG) - JF_UV = JF - ZGTF_START(ZGTF_START_INDEX_UVDERS) + ZGTF_START(ZGTF_START_INDEX_UV) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - ZGTF(2*JF-1,2*JM+IOFF) = -ZGTF(2*JF_UV,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF_UV-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF-1,2*JM+IOFF) = -PUV(2*JF,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF, 2*JM+IOFF) = PUV(2*JF-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO ENDIF !* 2.2 SCALAR VARIABLES -IF (ZGTF_START(ZGTF_START_INDEX_EWDERS) /= ZGTF_START(ZGTF_START_INDEX_EWDERS+1)) THEN +IF (ASSOCIATED(PSCALARS_EWDER)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=ZGTF_START(ZGTF_START_INDEX_EWDERS),ZGTF_START(ZGTF_START_INDEX_EWDERS+1)-1 + DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 ZACHTE2 = F%RACTHE(IGLG) - JF_SCALAR = JF - ZGTF_START(ZGTF_START_INDEX_EWDERS) + ZGTF_START(ZGTF_START_INDEX_SCALAR) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - ZGTF(2*JF-1,2*JM+IOFF) = -ZGTF(2*JF_SCALAR,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - ZGTF(2*JF, 2*JM+IOFF) = ZGTF(2*JF_SCALAR-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF-1,2*JM+IOFF) = -PSCALARS(2*JF,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF, 2*JM+IOFF) = PSCALARS(2*JF-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index d12faaa24..2e131b59f 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -65,7 +65,7 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& USE TPM_GEN ,ONLY : NERR, nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, ZGTF +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TPM_FLT ,ONLY : S USE FOURIER_IN_MOD ,ONLY : FOURIER_IN @@ -73,9 +73,6 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& USE FTINV_MOD ,ONLY : FTINV USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE TPM_FIELDS ,ONLY : ZGTF_START, ZGTF_START_INDEX_UV, ZGTF_START_INDEX_NSDERS, & - & ZGTF_START_INDEX_UVDERS, ZGTF_START_INDEX_EWDERS, ZGTF_START_INDEX_SCALAR, & - & ZGTF_START_INDEX_VOR, ZGTF_START_INDEX_DIV, ZGTF_START_INDEX_END use ieee_arithmetic ! @@ -103,11 +100,16 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), ALLOCATABLE, TARGET :: ZGTF(:,:) +REAL(KIND=JPRBT), POINTER :: & + & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) + INTEGER(KIND=JPIM) :: IST INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR +INTEGER(KIND=JPIM) :: IFIRST INTEGER(KIND=JPIM) :: JF_FS @@ -120,36 +122,53 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(1639,0) - ! Figure out where we want to store data in ZGTF - IST = 1 - ZGTF_START(ZGTF_START_INDEX_VOR) = IST - IF (LVORGP) IST = IST+KF_UV - ZGTF_START(ZGTF_START_INDEX_DIV) = IST - IF (LDIVGP) IST = IST+KF_UV - ZGTF_START(ZGTF_START_INDEX_UV) = IST - IST = IST+2*KF_UV - ZGTF_START(ZGTF_START_INDEX_SCALAR) = IST - IST = IST+KF_SCALARS - ZGTF_START(ZGTF_START_INDEX_NSDERS) = IST - IF (LSCDERS) IST = IST+KF_SCDERS - ZGTF_START(ZGTF_START_INDEX_UVDERS) = IST - IF (LUVDER) IST = IST+2*KF_UV - ZGTF_START(ZGTF_START_INDEX_EWDERS) = IST - IF (LSCDERS) IST = IST+KF_SCDERS - ZGTF_START(ZGTF_START_INDEX_END) = IST +! Compute ZGTF Domain decomposition +IFIRST = 0 +IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity +IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence +IFIRST = IFIRST + 2*KF_UV ! U and V +IFIRST = IFIRST + KF_SCALARS ! Scalars +IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives +IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('IFIRST /= KFIELD') +IF (LUVDER) IST = IST+2*KF_UV ! U and V derivatives +IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives + +ALLOCATE(ZGTF(2*IFIRST, D%NLENGTF)) +!$ACC ENTER DATA CREATE(ZGTF) + +! And reiterate domain decomposition to assign pointers +IFIRST = 0 +IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity +IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence +PUV => ZGTF(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) +IFIRST = IFIRST + 2*KF_UV ! U and V +PSCALARS => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) +IFIRST = IFIRST + KF_SCALARS ! Scalars +IF (LSCDERS) THEN + PSCALARS_NSDER => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives +ENDIF +IF (LUVDER) THEN + PUV_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) + IST = IST+2*KF_UV ! U and V derivatives +ENDIF +IF (LSCDERS) THEN + PSCALARS_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives +ENDIF ! from FOUBUF to ZGTF CALL FOURIER_IN(FOUBUF,ZGTF,KFIELD) ! 2. Fourier space computations -! fills rest of data up to ZGTF_START_INDEX_UVDERS -CALL FSC +! fill the rest of ZGTF +CALL FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) ! 3. Fourier transform ! from ZGTF to ZGTF IF(KF_FS > 0) THEN - CALL FTINV(ZGTF,size(zgtf,1),ZGTF_START(ZGTF_START_INDEX_UVDERS+1)-1) + CALL FTINV(ZGTF,SIZE(ZGTF,1)) ENDIF CALL GSTATS(1639,1) @@ -248,7 +267,8 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,1) ! ------------------------------------------------------------------ -!DEALLOCATE(ZGTF) +!$ACC EXIT DATA DELETE(ZGTF) +DEALLOCATE(ZGTF) END SUBROUTINE FTINV_CTL END MODULE FTINV_CTL_MOD diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 297fba3d0..658bb36c4 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -10,7 +10,7 @@ MODULE FTINV_MOD CONTAINS -SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) +SUBROUTINE FTINV(PREEL,KFIELD) !**** *FTINV - Inverse Fourier transform @@ -55,7 +55,7 @@ SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS, STRIDE +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(:,:) INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL @@ -91,7 +91,7 @@ SUBROUTINE FTINV(PREEL,STRIDE,KFIELDS) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),2*KFIELDS,STRIDE) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 34ad7bca9..8dfa193bb 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -170,8 +170,10 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& IFIRST = IFIRST + 2*KF_UV ! V PSCALARS => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars - PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) - IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + IF (LSCDERS) THEN + PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + ENDIF CALL GSTATS(431,1) IF (KF_UV > 0) THEN diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index 3067f2228..59f14bab3 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -62,15 +62,4 @@ MODULE TPM_FIELDS REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA1(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA2(:,:,:) -! TODO find a better place for this -INTEGER(JPIM) :: ZGTF_START(8) -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_VOR = 1 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_DIV = 2 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UV = 3 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_SCALAR = 4 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_NSDERS = 5 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_UVDERS = 6 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_EWDERS = 7 -INTEGER(JPIM), PARAMETER :: ZGTF_START_INDEX_END = 8 - END MODULE TPM_FIELDS From 77748f49842245fd9cd1284b9230aac843136ea1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:07 -0700 Subject: [PATCH 095/263] Avoid copy of ZGTF --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 30 ++++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 2e131b59f..c559e3cca 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -100,7 +100,7 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) -REAL(KIND=JPRBT), ALLOCATABLE, TARGET :: ZGTF(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) REAL(KIND=JPRBT), POINTER :: & & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 658bb36c4..bcba1b304 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -56,13 +56,12 @@ SUBROUTINE FTINV(PREEL,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL(:,:) -INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL +INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_C2R INTEGER(KIND=JPIM) :: IBEG,IEND,IINC - -REAL(KIND=JPRBT), allocatable :: PREEL2(:,:) +REAL(KIND=JPRBT), POINTER :: PREEL2(:,:), TMP(:,:) ! ------------------------------------------------------------------ @@ -78,8 +77,10 @@ SUBROUTINE FTINV(PREEL,KFIELD) IINC=-1 ENDIF -allocate(preel2(size(preel,1),size(preel,2))) -!$acc data create(preel2) present(preel) +ALLOCATE(PREEL2(SIZE(PREEL,1),SIZE(PREEL,2))) +!$ACC ENTER DATA CREATE(PREEL2) + +!$ACC DATA PRESENT(PREEL,PREEL2) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') @@ -97,15 +98,24 @@ SUBROUTINE FTINV(PREEL,KFIELD) !$ACC END HOST_DATA END DO +IRET = CUDA_SYNCHRONIZE() + IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') ENDIF CALL GSTATS(451,1) -!$acc kernels -preel(:,:) = preel2(:,:) -!$acc end kernels -!$acc end data +!$ACC END DATA + +! Swap pointers +TMP => PREEL +PREEL => PREEL2 +PREEL2 => TMP + +! and deallocate the local pointer +!$ACC EXIT DATA DELETE(PREEL2) +DEALLOCATE(PREEL2) + ! ------------------------------------------------------------------ END SUBROUTINE FTINV From 60480532767f48cb2bde763d6eaa3d3c10963714 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:07 -0700 Subject: [PATCH 096/263] Add missing synchronization in ftdir --- src/trans/gpu/internal/ftdir_mod.F90 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index ca5c4796b..8c354a1ea 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -68,7 +68,7 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) INTEGER(KIND=JPIM) :: IPLAN_R2C REAL(KIND=JPRBT) :: SCAL -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IRET INTEGER(KIND=JPIM) :: OFFSET_VAR real(kind=jprbt), allocatable :: zgtf2(:,:) @@ -107,6 +107,8 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) !$ACC end host_data END DO +IRET = CUDA_SYNCHRONIZE() + IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') ENDIF From 0fb20ea24cfae02c6059068f977a6102cb12db10 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:08 -0700 Subject: [PATCH 097/263] Make interface slightly more restrictive. If we want to have this flexibility, the interface should take care! --- src/trans/gpu/external/inv_trans.F90 | 90 ++++++++++++++-------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index 01f89013c..aa0747ce8 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -427,16 +427,16 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(.NOT. PRESENT(PSPVOR) ) THEN CALL ABORT_TRANS('INV_TRANS : IF_UV > 0 BUT PSPVOR MISSING') ENDIF - IF(UBOUND(PSPVOR,1) < IF_UV) THEN - WRITE(NERR,*)'INV_TRANS : UBOUND(PSPVOR,1) < IF_UV ',UBOUND(PSPVOR,1),IF_UV - CALL ABORT_TRANS('INV_TRANS : PSPVOR TOO SHORT') + IF(UBOUND(PSPVOR,1) /= IF_UV) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPVOR,1) /= IF_UV ',UBOUND(PSPVOR,1),IF_UV + CALL ABORT_TRANS('INV_TRANS : PSPVOR TOO SHORT OR TOO LONG') ENDIF IF(.NOT. PRESENT(PSPDIV) ) THEN CALL ABORT_TRANS('INV_TRANS : IF_UV > 0 BUT PSPDIV MISSING') ENDIF - IF(UBOUND(PSPDIV,1) < IF_UV) THEN - WRITE(NERR,*)'INV_TRANS : UBOUND(PSPDIV,1) < IF_UV ',UBOUND(PSPDIV,1),IF_UV - CALL ABORT_TRANS('INV_TRANS : PSPDIV TOO SHORT') + IF(UBOUND(PSPDIV,1) /= IF_UV) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPDIV,1) /= IF_UV ',UBOUND(PSPDIV,1),IF_UV + CALL ABORT_TRANS('INV_TRANS : PSPDIV TOO SHORT OR TOO LONG') ENDIF ENDIF @@ -451,10 +451,10 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(PRESENT(PSPSC2))THEN CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC2 BOTH PRESENT') ENDIF - IF(UBOUND(PSPSCALAR,1) < IF_SCALARS) THEN - WRITE(NERR,*)'INV_TRANS : UBOUND(PSPSCALAR,1) < IF_SCALARS) ',& + IF(UBOUND(PSPSCALAR,1) /= IF_SCALARS) THEN + WRITE(NERR,*)'INV_TRANS : UBOUND(PSPSCALAR,1) /= IF_SCALARS) ',& & UBOUND(PSPSCALAR,1),IF_SCALARS - CALL ABORT_TRANS('INV_TRANS : PSPSCALAR TOO SHORT') + CALL ABORT_TRANS('INV_TRANS : PSPSCALAR TOO SHORT OR TOO LONG') ENDIF ELSEIF(PRESENT(PSPSC3A)) THEN ENDIF @@ -506,19 +506,19 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& CALL ABORT_TRANS('INV_TRANS:PGP AND PGP2 CAN NOT BOTH BE PRESENT') ENDIF IUBOUND(1:3)=UBOUND(PGP) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP TOO SMALL ') + IF(IUBOUND(1) /= NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP TOO SMALL/LARGE ') ENDIF - IF(IUBOUND(2) < IF_GP) THEN - WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),IF_GP + IF(IUBOUND(2) /= IF_GP) THEN + WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(2),IF_GP WRITE(NOUT,*)'IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER ',& & IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER - CALL ABORT_TRANS('INV_TRANS:SECOND DIMENSION OF PGP TOO SMALL ') + CALL ABORT_TRANS('INV_TRANS:SECOND DIMENSION OF PGP TOO SMALL/LARGE ') ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') + IF(IUBOUND(3) /= NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP TOO SMALL/LARGE ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP TOO SMALL/LARGE ') ENDIF ELSE IF(NPROMATR > 0 .AND. 2*IF_UV_G+IF_SCALARS_G > NPROMATR) THEN @@ -531,21 +531,21 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& CALL ABORT_TRANS('INV_TRANS:PSPVOR HAS TO BE PRESENT WHEN PGPUV IS') ENDIF IUBOUND(1:4)=UBOUND(PGPUV) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGPUV TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGPUV TOO SMALL ') + IF(IUBOUND(1) /= NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGPUV TOO SMALL/LARGE ') ENDIF IF(IUBOUND(2) /= IF_UV_G) THEN WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGPUV INCONSISTENT ',IUBOUND(2),IF_UV_G CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGPUV INCONSISTENT ') ENDIF - IF(IUBOUND(3) < IF_UV_PAR) THEN - WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(3),IF_UV_PAR - CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL ') + IF(IUBOUND(3) /= IF_UV_PAR) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(3),IF_UV_PAR + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL/LARGE ') ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL ') + IF(IUBOUND(4) /= NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGPUV TOO SMALL/LARGE ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGPUV TOO SMALL/LARGE ') ENDIF ENDIF IF(PRESENT(PGP2)) THEN @@ -556,17 +556,17 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(IF_SC2_G > 0) THEN IF(PRESENT(PGP2)) THEN IUBOUND(1:3)=UBOUND(PGP2) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP2 TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP2 TOO SMALL ') + IF(IUBOUND(1) /= NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP2 TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP2 TOO SMALL/LARGE ') ENDIF IF(IUBOUND(2) /= IF_SC2_G) THEN WRITE(NOUT,*)'INV_TRANS:SEC. DIM. OF PGP2 INCONSISTENT ',IUBOUND(2),IF_SC2_G CALL ABORT_TRANS('INV_TRANS:SEC. DIMENSION OF PGP2 INCONSISTENT') ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP2 TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP2 TOO SMALL ') + IF(IUBOUND(3) /= NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:THIRD DIM. OF PGP2 TOO SMALL/LARGE ',IUBOUND(3),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP2 TOO SMALL/LARGE ') ENDIF ELSE CALL ABORT_TRANS('INV_TRANS:PGP2 MISSING') @@ -581,9 +581,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(IF_SC3A_G3 > 0) THEN IF(PRESENT(PGP3A)) THEN IUBOUND=UBOUND(PGP3A) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3A TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3A TOO SMALL ') + IF(IUBOUND(1) /= NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3A TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3A TOO SMALL/LARGE ') ENDIF IF(IUBOUND(2) /= IF_SC3A_G2) THEN WRITE(NOUT,*)'INV_TRANS:SEC DIM. OF PGP3A INCONSISTENT ',IUBOUND(2),IF_SC3A_G2 @@ -594,9 +594,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & IUBOUND(3),IF_SC3A_G3 CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP3A INCONSISTENT ') ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGP3A TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGP3A TOO SMALL ') + IF(IUBOUND(4) /= NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGP3A TOO SMALL/LARGE ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGP3A TOO SMALL/LARGE ') ENDIF ELSE CALL ABORT_TRANS('INV_TRANS:PGP3A MISSING') @@ -611,9 +611,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(IF_SC3B_G3 > 0) THEN IF(PRESENT(PGP3B)) THEN IUBOUND=UBOUND(PGP3B) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3B TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3B TOO SMALL ') + IF(IUBOUND(1) /= NPROMA) THEN + WRITE(NOUT,*)'INV_TRANS:FIRST DIM. OF PGP3B TOO SMALL/LARGE ',IUBOUND(1),NPROMA + CALL ABORT_TRANS('INV_TRANS:FIRST DIMENSION OF PGP3B TOO SMALL/LARGE ') ENDIF IF(IUBOUND(2) /= IF_SC3B_G2) THEN WRITE(NOUT,*)'INV_TRANS:SEC DIM. OF PGP3B INCONSISTENT ',IUBOUND(2),IF_SC3B_G2 @@ -624,9 +624,9 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & IUBOUND(3),IF_SC3B_G3 CALL ABORT_TRANS('INV_TRANS:THIRD DIMENSION OF PGP3B INCONSISTENT ') ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGP3B TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGP3B TOO SMALL ') + IF(IUBOUND(4) /= NGPBLKS) THEN + WRITE(NOUT,*)'INV_TRANS:FOURTH DIM. OF PGP3B TOO SMALL/LARGE ',IUBOUND(4),NGPBLKS + CALL ABORT_TRANS('INV_TRANS:FOURTH DIMENSION OF PGP3B TOO SMALL/LARGE ') ENDIF ELSE CALL ABORT_TRANS('INV_TRANS:PGP3B MISSING') From ec7e47d37e4915f1ae08c709e331f1b84bdbb860 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:08 -0700 Subject: [PATCH 098/263] Slightly reduce the interfaces --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 39 ++++++++------------ src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 8 ++-- src/trans/gpu/internal/trltog_mod.F90 | 1 + 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index c559e3cca..3a9c1055e 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -10,10 +10,10 @@ MODULE FTINV_CTL_MOD CONTAINS -SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,FOUBUF,KVSETUV,KVSETSC,KPTRGP, & - & KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) +SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& + & KF_UV,KF_SCALARS,KF_GP,FOUBUF, & + & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP) !**** *FTINV_CTL - Inverse Fourier transform control @@ -32,10 +32,7 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& ! KF_SCALARS_G - global number of scalar spectral fields ! KF_UV - local number of spectral u-v fields ! KF_SCALARS - local number of scalar spectral fields -! KF_SCDERS - local number of derivatives of scalar spectral fields ! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_OUT_LT - total number of fields coming out from inverse LT ! KVSETUV - "B" set in spectral/fourier space for ! u and v variables ! KVSETSC - "B" set in spectral/fourier space for @@ -78,15 +75,12 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& IMPLICIT NONE -INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD +INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_INPUT INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV_G INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS_G INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCDERS INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_OUT_LT INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) @@ -110,8 +104,7 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR INTEGER(KIND=JPIM) :: IFIRST - -INTEGER(KIND=JPIM) :: JF_FS +INTEGER(KIND=JPIM) :: KF_FS ! ------------------------------------------------------------------ @@ -119,7 +112,6 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& ! 1. Copy Fourier data to local array CALL GSTATS(107,0) - CALL GSTATS(1639,0) ! Compute ZGTF Domain decomposition @@ -129,11 +121,13 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& IFIRST = IFIRST + 2*KF_UV ! U and V IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives -IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('IFIRST /= KFIELD') +! This verifies if we get the same assumptions about how much data we get from the LT space +IF (2*IFIRST /= KF_INPUT) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') IF (LUVDER) IST = IST+2*KF_UV ! U and V derivatives IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives +KF_FS = IFIRST -ALLOCATE(ZGTF(2*IFIRST, D%NLENGTF)) +ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) !$ACC ENTER DATA CREATE(ZGTF) ! And reiterate domain decomposition to assign pointers @@ -158,7 +152,7 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& ENDIF ! from FOUBUF to ZGTF -CALL FOURIER_IN(FOUBUF,ZGTF,KFIELD) +CALL FOURIER_IN(FOUBUF,ZGTF,KF_INPUT) ! 2. Fourier space computations @@ -166,13 +160,12 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& CALL FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) ! 3. Fourier transform -! from ZGTF to ZGTF +! inplace operation IF(KF_FS > 0) THEN - CALL FTINV(ZGTF,SIZE(ZGTF,1)) + CALL FTINV(ZGTF,2*KF_FS) ENDIF CALL GSTATS(1639,1) - CALL GSTATS(107,1) ! 4. Transposition @@ -253,15 +246,13 @@ SUBROUTINE FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& ENDIF CALL GSTATS(157,0) -JF_FS=KF_FS-D%IADJUST_I #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' -CALL TRLTOG_CUDAAWARE(ZGTF,JF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' -!$ACC UPDATE HOST(ZGTF) -CALL TRLTOG(ZGTF,JF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif CALL GSTATS(157,1) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 285c5f3ec..3315e6ac7 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -265,18 +265,18 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& + & IF_UV,IF_SCALARS,IF_GP,& & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_UV_G > 0) THEN CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& + & IF_UV,IF_SCALARS,IF_GP,& & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),KPTRGP=IPTRGP,& & PGP=PGP) ELSEIF(IF_SCALARS_G > 0) THEN CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& + & IF_UV,IF_SCALARS,IF_GP,& & FOUBUF,KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& & PGP=PGP) ENDIF @@ -295,7 +295,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! from FOUBUF to PGPXXX call nvtxStartRange("FTINV") CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,& + & KF_UV,KF_SCALARS,KF_GP,& & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index c1b19f023..26d6e6982 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -784,6 +784,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! -------------------- IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) + !$ACC UPDATE HOST(PGLAT) CALL GSTATS(1806,0) From 78514a5a312ce9b447e85cc3d1bf2ddb994a730c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:08 -0700 Subject: [PATCH 099/263] Typo: Wrong offsets to FSC --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 6 +++--- src/trans/gpu/internal/trltog_mod.F90 | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 3a9c1055e..1c451b43e 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -123,7 +123,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ! This verifies if we get the same assumptions about how much data we get from the LT space IF (2*IFIRST /= KF_INPUT) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') -IF (LUVDER) IST = IST+2*KF_UV ! U and V derivatives +IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST @@ -144,7 +144,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ENDIF IF (LUVDER) THEN PUV_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) - IST = IST+2*KF_UV ! U and V derivatives + IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF IF (LSCDERS) THEN PSCALARS_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) @@ -248,7 +248,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' -CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 26d6e6982..6fb5edca7 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -10,7 +10,7 @@ MODULE TRLTOG_MOD CONTAINS - SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& + SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *trltog * - transposition of grid point data from latitudinal @@ -89,7 +89,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRBT), INTENT(IN) :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP - INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G + INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP(:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGPUV(:,:,:,:) From 7bf22c08b8a6f15eb104e09dcbe9f8ab751af3ea Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:08 -0700 Subject: [PATCH 100/263] Cleanup TRLTOG vertical offsets --- src/trans/gpu/internal/trltog_mod.F90 | 327 +++++++++++--------------- 1 file changed, 139 insertions(+), 188 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 6fb5edca7..70f7b540b 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -82,6 +82,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, USE PE2SET_MOD ,ONLY : PE2SET USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML USE OML_MOD ,ONLY : OML_MY_THREAD + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE MPI IMPLICIT NONE @@ -113,11 +114,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & &JBLK, ILAT_STRIP - INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) - INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) - INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF - INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW - INTEGER(KIND=JPIM) :: NDERS + ! Contains FIELD, PARS, LEVS + INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) + INTEGER(KIND=JPIM), PARAMETER :: IGP_OFFSETS_UV=1, IGP_OFFSETS_GP2=2, IGP_OFFSETS_GP3A=3, IGP_OFFSETS_GP3B=4 + INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF INTEGER(KIND=JPIM) :: IFLDA(KF_GP) INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V @@ -127,10 +127,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V - LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY - LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) - LOGICAL :: LLINDER - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -147,180 +143,135 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ! -------------------- IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - CALL GSTATS(1806,0) - LLINDER = .FALSE. - LLPGPUV = .FALSE. - LLPGP3A = .FALSE. - LLPGP3B = .FALSE. - LLPGP2 = .FALSE. - LLPGPONLY = .FALSE. - IF(PRESENT(KPTRGP)) LLINDER=.TRUE. - IF(PRESENT(PGP)) LLPGPONLY=.TRUE. - IF(PRESENT(PGPUV)) LLPGPUV=.TRUE. - IF(PRESENT(PGP3A)) LLPGP3A=.TRUE. - IF(PRESENT(PGP3B)) LLPGP3B=.TRUE. - IF(PRESENT(PGP2)) LLPGP2=.TRUE. - IUVPAR=0 - IUVLEV=0 - IOFF1=0 - IOFFNS=KF_SCALARS_G - IOFFEW=2*KF_SCALARS_G + IF (.NOT. PRESENT(PGP)) THEN + ! This is only relevant if we use the split interface (i.e. not PGP) + IGP2PAR = 0 + IGP3APAR = 0 + IGP3ALEV = 0 + IGP3BPAR = 0 + IGP3BLEV = 0 + IF (PRESENT(PGP2)) THEN + IGP2PAR = UBOUND(PGP2,2) + IF(LSCDERS) IGP2PAR = IGP2PAR/3 + ENDIF + IF (PRESENT(PGP3A)) THEN + IGP3ALEV = UBOUND(PGP3A,2) + IGP3APAR = UBOUND(PGP3A,3) + IF(LSCDERS) IGP3APAR = IGP3APAR/3 + ENDIF + IF (PRESENT(PGP3B)) THEN + IGP3BLEV = UBOUND(PGP3B,2) + IGP3BPAR = UBOUND(PGP3B,3) + IF(LSCDERS) IGP3BPAR = IGP3BPAR/3 + ENDIF + IF (IGP2PAR + IGP3ALEV*IGP3APAR + IGP3BPAR*IGP3BLEV /= KF_SCALARS_G) THEN + PRINT *, IGP2PAR, IGP3APAR, IGP3ALEV, IGP3BPAR, IGP3BLEV + CALL ABORT_TRANS("INCONSISTENCY IN SCALARS") + ENDIF - LLUV(:) = .FALSE. - IF (LLPGPUV) THEN - IOFF=0 - IUVLEV=UBOUND(PGPUV,2) + ! ! This is only relevant if we use the split interface (i.e. not PGP) + IUVPAR = 1 + IOFF=1 IF(LVORGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV + IOFF=IOFF+KF_UV_G ENDIF + IF(LDIVGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV + IOFF=IOFF+KF_UV_G ENDIF - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - ENDDO - IUVPAR=IUVPAR+2 - LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE. - IOFF=IOFF+2*IUVLEV - IOFF1=IOFF - IOFFNS=IOFFNS+IOFF - IOFFEW=IOFFEW+IOFF - IOFF=IUVPAR*IUVLEV+KF_SCALARS_G - IF(LUVDER) THEN - IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - LLUV(IOFF+J)=.TRUE. - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - LLUV(IOFF+J+IUVLEV)=.TRUE. - ENDDO - IUVPAR=IUVPAR+2 - IOFF=IOFF+2*IUVLEV - IOFFEW=IOFFEW+2*IUVLEV - ENDIF - ENDIF + ! U + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! V + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! Scalars + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV - LLGP2(:)=.FALSE. - IF(LLPGP2) THEN - IOFF=IOFF1 - IGP2PAR=UBOUND(PGP2,2) - IF(LSCDERS) IGP2PAR=IGP2PAR/3 - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J - ENDDO - IOFF1=IOFF1+IGP2PAR IF(LSCDERS) THEN - IOFF=IOFFNS - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+IGP2PAR - ENDDO - IOFFNS=IOFF+IGP2PAR - IOFF=IOFFEW - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+2*IGP2PAR - ENDDO - IOFFEW=IOFF+IGP2PAR + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV ENDIF - ENDIF - LLGP3A(:) = .FALSE. - IF(LLPGP3A) THEN - IGP3ALEV=UBOUND(PGP3A,2) - IGP3APAR=UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR=IGP3APAR/3 - IOFF=IOFF1 - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1 - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3APAR - IOFF1=IOFF1+IGP3APAR*IGP3ALEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3APAR - IOFFNS=IOFFNS+IGP3APAR*IGP3ALEV - IOFF=IOFFEW - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV + IF(LUVDER) THEN + ! U Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! V Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G ENDIF - ENDIF - LLGP3B(:) = .FALSE. - IF(LLPGP3B) THEN - IGP3BLEV=UBOUND(PGP3B,2) - IGP3BPAR=UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR=IGP3BPAR/3 - IOFF=IOFF1 - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1 - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3BPAR - IOFF1=IOFF1+IGP3BPAR*IGP3BLEV IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3BPAR - IOFFNS=IOFFNS+IGP3BPAR*IGP3BLEV - IOFF=IOFFEW - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+2*IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+2*IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+2*IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV ENDIF ENDIF @@ -388,12 +339,12 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ISENDTOT(JROC) = IPOS*KF_FS ENDDO - !$ACC DATA PRESENT(PGLAT) COPYIN(IIN_TO_SEND_BUFR,LLUV,LLGP2,LLGP3A,LLGP3B) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) COPYIN(IUVLEVS,IUVPARS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) COPYIN(IGP2PARS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) COPYIN(IGP3APARS,IGP3ALEVS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) COPYIN(IGP3BPARS, IGP3BLEVS) ASYNC(1) + !$ACC DATA PRESENT(PGLAT) COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) CALL GSTATS(1806,1) @@ -422,7 +373,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) IF (PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) DO JL=1,IRECV_WSET_SIZE_V DO JFLD=1,KF_FS JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 @@ -433,21 +384,21 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDDO ENDDO ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) DO JL=1,IRECV_WSET_SIZE_V DO JFLD=1,KF_FS JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL) - IF(LLUV(IFLD)) THEN - PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PGLAT(JFLD,IPOS) - ELSEIF(LLGP2(IFLD)) THEN - PGP2(JK,IGP2PARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) - ELSEIF(LLGP3A(IFLD)) THEN - PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) - ELSEIF(LLGP3B(IFLD)) THEN - PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PGLAT(JFLD,IPOS) + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PGLAT(JFLD,IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) ENDIF ENDDO ENDDO @@ -603,14 +554,14 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD=IFLDA(JFLD) JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL - IF(LLUV(IFLD)) THEN - PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = ZCOMBUFR(JI) - ELSEIF(LLGP2(IFLD)) THEN - PGP2(JK,IGP2PARS(IFLD),JBLK) = ZCOMBUFR(JI) - ELSEIF(LLGP3A(IFLD)) THEN - PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) = ZCOMBUFR(JI) - ELSEIF(LLGP3B(IFLD)) THEN - PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) = ZCOMBUFR(JI) + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) ENDIF ENDDO ENDDO From f5b47a232c355ad067bec74dba475c5265ef7ee8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 101/263] Explicitly pass arrays into FTDIR --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 18 +----------------- src/trans/gpu/internal/ftdir_mod.F90 | 5 +++-- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 1800b9f6b..d3d5321cd 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -73,19 +73,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & IMPLICIT NONE -INTERFACE - SUBROUTINE cudaProfilerStart() BIND(C,name='cudaProfilerStart') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStart -END INTERFACE - -INTERFACE - SUBROUTINE cudaProfilerStop() BIND(C,name='cudaProfilerStop') - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT - IMPLICIT NONE - END SUBROUTINE cudaProfilerStop -END INTERFACE ! Dummy arguments @@ -117,8 +104,6 @@ END SUBROUTINE cudaProfilerStop ! Field distribution in Spectral/Fourier space -!call cudaProfilerStart() - IF(PRESENT(KVSETUV)) THEN IVSETUV(:) = KVSETUV(:) ELSE @@ -202,14 +187,13 @@ END SUBROUTINE cudaProfilerStop CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(SIZE(ZGTF,1),KF_FS) + CALL FTDIR(ZGTF,FOUBUF_IN,SIZE(ZGTF,1),KF_FS) ENDIF CALL GSTATS(1640,1) !DEALLOCATE(ZGTF) CALL GSTATS(106,1) ! ------------------------------------------------------------------ -!call cudaProfilerStop() END SUBROUTINE FTDIR_CTL END MODULE FTDIR_CTL_MOD diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 8c354a1ea..3948e932d 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(STRIDE,KF_FS) +SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) !**** *FTDIR - Direct Fourier transform @@ -51,7 +51,6 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NPTRLS, D_NSTAGT0B, D_NPNTGTB0, D_NPROCM -USE TPM_TRANS ,ONLY : ZGTF, FOUBUF_IN USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD @@ -62,6 +61,8 @@ SUBROUTINE FTDIR(STRIDE,KF_FS) INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KF_FS INTEGER(KIND=JPIM) :: KGL +REAL(KIND=JPRBT), INTENT(IN) :: ZGTF(:,:) +REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1, IPROC, ISTA INTEGER(KIND=JPIM) :: IOFF From be78530c5c23b71d841cdd67f9bdf413d34190b5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 102/263] Add back FOURIER_OUT function/file --- src/trans/gpu/internal/fourier_out_mod.F90 | 87 ++++++++++++++++++++++ src/trans/gpu/internal/ftdir_ctl_mod.F90 | 4 +- src/trans/gpu/internal/ftdir_mod.F90 | 39 ++-------- 3 files changed, 98 insertions(+), 32 deletions(-) create mode 100755 src/trans/gpu/internal/fourier_out_mod.F90 diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 new file mode 100755 index 000000000..d86ee1d21 --- /dev/null +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -0,0 +1,87 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE FOURIER_OUT_MOD +CONTAINS +SUBROUTINE FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) + +!**** *FOURIER_OUT* - Copy fourier data from local array to buffer + +! Purpose. +! -------- +! Routine for copying fourier data from local array to buffer + +!** Interface. +! ---------- +! CALL FOURIER_OUT(...) + +! Explicit arguments : PREEL - local fourier/GP array +! -------------------- KFIELDS - number of fields +! +! Externals. None. +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 + +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + +USE TPM_DISTR ,ONLY : D, MYSETW,D_NPTRLS,D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPROCL +USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NLOEN +! + +IMPLICIT NONE + +REAL(KIND=JPRBT), INTENT(IN) :: ZGTF(:,:) +REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS +INTEGER(KIND=JPIM) :: KGL +REAL(KIND=JPRBT) :: SCAL +INTEGER(KIND=JPIM) :: OFFSET_VAR + +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA, ISTA1,JMMAX, iunit + +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF,iimax1,iimax2,iimax3 + +! scale results and move into next transformation buffer + +OFFSET_VAR=D_NPTRLS(MYSETW) +!$ACC DATA PRESENT(D,G_NLOEN,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,D_NPROCM,ZGTF,G_NMEN,D_NSTAGTF) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,SCAL,JM) DEFAULT(NONE) +DO KGL=1,D%NDGL_FS + DO JF=1,KF_FS + IGLG = OFFSET_VAR+KGL-1 + IOFF = D_NSTAGTF(KGL)+1 + + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + !$ACC LOOP SEQ + DO JM=0,G_NMEN(IGLG) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS + + FOUBUF_IN(ISTA+2*JF-1) = SCAL * ZGTF(2*JF-1, 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = SCAL * ZGTF(2*JF , 2*JM+IOFF) + ENDDO + ENDDO +ENDDO +!$ACC END DATA + +! ------------------------------------------------------------------ + +END SUBROUTINE FOURIER_OUT +END MODULE FOURIER_OUT_MOD + diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index d3d5321cd..018d4bcf5 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -62,6 +62,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & !USE TPM_GEOMETRY USE TPM_TRANS ,ONLY : ZGTF, FOUBUF_IN USE TPM_DISTR ,ONLY : D, MYPROC, NPROC +USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE USE FTDIR_MOD ,ONLY : FTDIR @@ -187,7 +188,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF,FOUBUF_IN,SIZE(ZGTF,1),KF_FS) + CALL FTDIR(ZGTF,SIZE(ZGTF,1),KF_FS) + CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) ENDIF CALL GSTATS(1640,1) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 3948e932d..1df97b3a6 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) +SUBROUTINE FTDIR(ZGTF,STRIDE,KF_FS) !**** *FTDIR - Direct Fourier transform @@ -50,8 +50,8 @@ SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) USE TPM_GEN ,ONLY : LSYNC_TRANS USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NPTRLS, D_NSTAGT0B, D_NPNTGTB0, D_NPROCM -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC +USE TPM_GEOMETRY ,ONLY : G USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -60,17 +60,14 @@ SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KF_FS +REAL(KIND=JPRBT), INTENT(INOUT) :: ZGTF(:,:) INTEGER(KIND=JPIM) :: KGL -REAL(KIND=JPRBT), INTENT(IN) :: ZGTF(:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1, IPROC, ISTA INTEGER(KIND=JPIM) :: IOFF INTEGER(KIND=JPIM) :: IPLAN_R2C -REAL(KIND=JPRBT) :: SCAL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IRET -INTEGER(KIND=JPIM) :: OFFSET_VAR real(kind=jprbt), allocatable :: zgtf2(:,:) ! ------------------------------------------------------------------ @@ -87,10 +84,7 @@ SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) allocate(zgtf2(size(zgtf,1),size(zgtf,2))) -!$ACC DATA & -!$ACC& PRESENT(ZGTF,FOUBUF_IN, & -!$ACC& D,D_NSTAGTF,G_NMEN,G_NLOEN,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM) & -!$ACC& CREATE(ZGTF2) +!$ACC DATA CREATE(ZGTF2) PRESENT(ZGTF) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') @@ -115,26 +109,9 @@ SUBROUTINE FTDIR(ZGTF,FOUBUF_IN,STRIDE,KF_FS) ENDIF CALL GSTATS(450,1) -! scale results and move into next transformation buffer - -OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,SCAL,JM) DEFAULT(NONE) -DO KGL=1,D%NDGL_FS - DO JF=1,KF_FS - IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 - - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS - - FOUBUF_IN(ISTA+2*JF-1) = SCAL * ZGTF2(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = SCAL * ZGTF2(2*JF , 2*JM+IOFF) - ENDDO - ENDDO -ENDDO +!$ACC KERNELS DEFAULT(NONE) +ZGTF(:,:) = ZGTF2(:,:) +!$ACC END KERNELS !$ACC END DATA From e99a030c692d924eab6d87d4f722d2aaadea0dbd Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 103/263] ZGTF is now a local variable --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 14 +++++++++----- src/trans/gpu/internal/ftdir_mod.F90 | 12 ++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 018d4bcf5..9f3b6f4c0 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -60,7 +60,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & USE TPM_GEN, only: nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : ZGTF, FOUBUF_IN +USE TPM_TRANS ,ONLY : FOUBUF_IN USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT @@ -91,14 +91,14 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) ! Local variables -!REAL(KIND=JPRBT),ALLOCATABLE :: ZGTF(:,:) +REAL(KIND=JPRBT), ALLOCATABLE :: ZGTF(:,:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) INTEGER(KIND=JPIM) :: IFGP2,IFGP3A,IFGP3B,IOFF,J3 -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,IFIRST INTEGER(KIND=JPIM) :: ISIZE,IFIELDS,ICHUNK,ICHUNKS,JK ! ------------------------------------------------------------------ @@ -148,6 +148,9 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & IST = IST+KF_SCALARS_G ENDIF +ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) +!$ACC ENTER DATA CREATE(ZGTF) + ! Transposition CALL GSTATS(158,0) @@ -188,12 +191,13 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF,SIZE(ZGTF,1),KF_FS) + CALL FTDIR(ZGTF,SIZE(ZGTF,1),2*KF_FS) CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) ENDIF CALL GSTATS(1640,1) -!DEALLOCATE(ZGTF) +!$ACC EXIT DATA DELETE(ZGTF) +DEALLOCATE(ZGTF) CALL GSTATS(106,1) ! ------------------------------------------------------------------ END SUBROUTINE FTDIR_CTL diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 1df97b3a6..1998d7340 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(ZGTF,STRIDE,KF_FS) +SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) !**** *FTDIR - Direct Fourier transform @@ -24,7 +24,7 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KF_FS) ! Explicit arguments : PREEL - Fourier/grid-point array ! -------------------- KSTIRDE - stride of PREEL -! KF_FS - number of fields +! KFIELD - number of fields ! Method. ! ------- @@ -59,7 +59,7 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KF_FS) IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KF_FS +INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KFIELD REAL(KIND=JPRBT), INTENT(INOUT) :: ZGTF(:,:) INTEGER(KIND=JPIM) :: KGL @@ -96,10 +96,10 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KF_FS) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),2*KF_FS,STRIDE) - !$ACC host_data use_device(ZGTF,ZGTF2) + CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) + !$ACC HOST_DATA USE_DEVICE(ZGTF,ZGTF2) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,ZGTF(1,IOFF),ZGTF2(1,IOFF)) - !$ACC end host_data + !$ACC END HOST_DATA END DO IRET = CUDA_SYNCHRONIZE() From 8a79c6ad7d1bd483110ba20ee9a4bcb7914d46a9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 104/263] Implement pointer swap in ftdir --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 4 +-- src/trans/gpu/internal/ftdir_mod.F90 | 34 +++++++++++++----------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 9f3b6f4c0..b5f05bd16 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) ! Local variables -REAL(KIND=JPRBT), ALLOCATABLE :: ZGTF(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -191,7 +191,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF,SIZE(ZGTF,1),2*KF_FS) + CALL FTDIR(ZGTF,2*KF_FS) CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) ENDIF diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 1998d7340..e05d19c3f 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,8 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) - +SUBROUTINE FTDIR(ZGTF,KFIELD) !**** *FTDIR - Direct Fourier transform @@ -23,8 +22,7 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) ! CALL FTDIR(..) ! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KSTIRDE - stride of PREEL -! KFIELD - number of fields +! -------------------- KFIELD - number of fields ! Method. ! ------- @@ -44,7 +42,6 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) ! D. Degrauwe (Feb 2012): Alternative extension zone (E') ! G. Mozdzynski (Oct 2014): support for FFTW transforms ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW - ! ------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS @@ -55,12 +52,12 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER -! IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: STRIDE,KFIELD -REAL(KIND=JPRBT), INTENT(INOUT) :: ZGTF(:,:) +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: ZGTF(:,:) + INTEGER(KIND=JPIM) :: KGL INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1, IPROC, ISTA @@ -68,7 +65,7 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IRET -real(kind=jprbt), allocatable :: zgtf2(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF2(:,:), TMP(:,:) ! ------------------------------------------------------------------ @@ -83,8 +80,10 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) ENDIF -allocate(zgtf2(size(zgtf,1),size(zgtf,2))) -!$ACC DATA CREATE(ZGTF2) PRESENT(ZGTF) +ALLOCATE(ZGTF2(SIZE(ZGTF,1),SIZE(ZGTF,2))) +!$ACC ENTER DATA CREATE(ZGTF2) + +!$ACC DATA PRESENT(ZGTF,ZGTF2) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') @@ -109,12 +108,17 @@ SUBROUTINE FTDIR(ZGTF,STRIDE,KFIELD) ENDIF CALL GSTATS(450,1) -!$ACC KERNELS DEFAULT(NONE) -ZGTF(:,:) = ZGTF2(:,:) -!$ACC END KERNELS - !$ACC END DATA +! Swap pointers +TMP => ZGTF +ZGTF => ZGTF2 +ZGTF2 => TMP + +! and deallocate the local pointer +!$ACC EXIT DATA DELETE(ZGTF2) +DEALLOCATE(ZGTF2) + ! ------------------------------------------------------------------ END SUBROUTINE FTDIR From d189d72ef32586dad1441b85ca67b112b8aeb21a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 105/263] Non-critical: FTDIR and FTINV perfectly shadow eachother now --- src/trans/gpu/internal/ftdir_mod.F90 | 34 ++++++++++++---------------- src/trans/gpu/internal/ftinv_mod.F90 | 13 +++++------ 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index e05d19c3f..1855fa6bd 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(ZGTF,KFIELD) +SUBROUTINE FTDIR(PREEL,KFIELD) !**** *FTDIR - Direct Fourier transform @@ -56,16 +56,12 @@ SUBROUTINE FTDIR(ZGTF,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: ZGTF(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL(:,:) -INTEGER(KIND=JPIM) :: KGL - -INTEGER(KIND=JPIM) :: IGLG,JM,JF,IST1, IPROC, ISTA -INTEGER(KIND=JPIM) :: IOFF +INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C - -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IRET -REAL(KIND=JPRBT), POINTER :: ZGTF2(:,:), TMP(:,:) +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +REAL(KIND=JPRBT), POINTER :: PREEL2(:,:), TMP(:,:) ! ------------------------------------------------------------------ @@ -80,10 +76,10 @@ SUBROUTINE FTDIR(ZGTF,KFIELD) ENDIF -ALLOCATE(ZGTF2(SIZE(ZGTF,1),SIZE(ZGTF,2))) -!$ACC ENTER DATA CREATE(ZGTF2) +ALLOCATE(PREEL2(SIZE(PREEL,1),SIZE(PREEL,2))) +!$ACC ENTER DATA CREATE(PREEL2) -!$ACC DATA PRESENT(ZGTF,ZGTF2) +!$ACC DATA PRESENT(PREEL,PREEL2) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') @@ -96,8 +92,8 @@ SUBROUTINE FTDIR(ZGTF,KFIELD) IGLG = D%NPTRLS(MYSETW)+KGL-1 CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) - !$ACC HOST_DATA USE_DEVICE(ZGTF,ZGTF2) - CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,ZGTF(1,IOFF),ZGTF2(1,IOFF)) + !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) + CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA END DO @@ -111,13 +107,13 @@ SUBROUTINE FTDIR(ZGTF,KFIELD) !$ACC END DATA ! Swap pointers -TMP => ZGTF -ZGTF => ZGTF2 -ZGTF2 => TMP +TMP => PREEL +PREEL => PREEL2 +PREEL2 => TMP ! and deallocate the local pointer -!$ACC EXIT DATA DELETE(ZGTF2) -DEALLOCATE(ZGTF2) +!$ACC EXIT DATA DELETE(PREEL2) +DEALLOCATE(PREEL2) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index bcba1b304..7510bc79b 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -22,7 +22,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) ! CALL FTINV(..) ! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELDS - number of fields +! -------------------- KFIELD - number of fields ! Method. ! ------- @@ -38,16 +38,16 @@ SUBROUTINE FTINV(PREEL,KFIELD) ! Modifications. ! -------------- ! Original : 00-03-03 -! G. Radnoti 01-04-24 : 2D model (NLOEN=1) +! G. Radnoti 01-04-24 2D model (NLOEN=1) ! D. Degrauwe (Feb 2012): Alternative extension zone (E') ! G. Mozdzynski (Oct 2014): support for FFTW transforms ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC +USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD @@ -65,8 +65,6 @@ SUBROUTINE FTINV(PREEL,KFIELD) ! ------------------------------------------------------------------ - - IF(MYPROC > NPROC/2)THEN IBEG=1 IEND=D%NDGL_FS @@ -77,6 +75,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) IINC=-1 ENDIF + ALLOCATE(PREEL2(SIZE(PREEL,1),SIZE(PREEL,2))) !$ACC ENTER DATA CREATE(PREEL2) @@ -90,7 +89,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) DO KGL=IBEG,IEND,IINC IOFF=D%NSTAGTF(KGL)+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 + IGLG = D%NPTRLS(MYSETW)+KGL-1 CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) From 29dd4c353c7e8434c1ab2253c3f504b43f054dd7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:09 -0700 Subject: [PATCH 106/263] Minor changes to make FOURIER_IN and FOURIER_OUT more ismilar --- src/trans/gpu/internal/fourier_in_mod.F90 | 21 +++------ src/trans/gpu/internal/fourier_out_mod.F90 | 50 +++++++++++++--------- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 5 +-- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 122f816b1..a51d2631d 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -38,26 +38,19 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRB, JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NMEN_MAX -use tpm_gen, only: nout +USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_GEOMETRY ,ONLY : G,G_NMEN ! IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS - -INTEGER(KIND=JPIM) :: KGL - +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) -REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA,OFFSET_VAR,IOFF -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,iimax1,iimax2,iimax3,iunit +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS -! ------------------------------------------------------------------ +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF,KGL +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC IF(MYPROC > NPROC/2)THEN IBEG=1 diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index d86ee1d21..8e697195b 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_OUT_MOD CONTAINS -SUBROUTINE FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) +SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) !**** *FOURIER_OUT* - Copy fourier data from local array to buffer @@ -38,32 +38,40 @@ SUBROUTINE FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW,D_NPTRLS,D_NSTAGTF,D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPROCL -USE TPM_GEOMETRY ,ONLY : G, G_NMEN,G_NLOEN +USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN ! IMPLICIT NONE -REAL(KIND=JPRBT), INTENT(IN) :: ZGTF(:,:) +REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) -INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM) :: KGL +INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS + +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF,KGL +INTEGER(KIND=JPIM) :: IBEG,IEND,IINC + REAL(KIND=JPRBT) :: SCAL -INTEGER(KIND=JPIM) :: OFFSET_VAR -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA, ISTA1,JMMAX, iunit +IF(MYPROC > NPROC/2)THEN + IBEG=1 + IEND=D%NDGL_FS + IINC=1 +ELSE + IBEG=D%NDGL_FS + IEND=1 + IINC=-1 +ENDIF -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC, IOFF,iimax1,iimax2,iimax3 +!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL,D_NSTAGTF,G_NLOEN) ASYNC(1) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC DATA PRESENT(D,G_NLOEN,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,D_NPROCM,ZGTF,G_NMEN,D_NSTAGTF) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,SCAL,JM) DEFAULT(NONE) -DO KGL=1,D%NDGL_FS - DO JF=1,KF_FS +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) +DO KGL=IBEG,IEND,IINC + DO JF=1,KFIELDS/2 IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 @@ -71,16 +79,16 @@ SUBROUTINE FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*2*KF_FS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS - FOUBUF_IN(ISTA+2*JF-1) = SCAL * ZGTF(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = SCAL * ZGTF(2*JF , 2*JM+IOFF) - ENDDO - ENDDO + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL(2*JF-1, 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL(2*JF , 2*JM+IOFF) + ENDDO + ENDDO ENDDO !$ACC END DATA -! ------------------------------------------------------------------ +!$ACC WAIT(1) END SUBROUTINE FOURIER_OUT END MODULE FOURIER_OUT_MOD diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index b5f05bd16..d57803341 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -155,7 +155,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & CALL GSTATS(158,0) -! needed ??? JF_FS=KF_FS-D%IADJUST_D #ifdef USE_CUDA_AWARE_MPI_FT IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') @@ -187,12 +186,10 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & ! Fourier transform -!write(301,*) 'sizey: ', myproc, size(zgtf,1), KF_FS - CALL GSTATS(1640,0) IF (KF_FS > 0) THEN CALL FTDIR(ZGTF,2*KF_FS) - CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) + CALL FOURIER_OUT(ZGTF,FOUBUF_IN,2*KF_FS) ENDIF CALL GSTATS(1640,1) From 2951caa3a9ab12cafa19a753eed1e8187cc7b05e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:10 -0700 Subject: [PATCH 107/263] Pass through FOUBUF_IN --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 15 ++++++--------- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 4 ++-- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 6 ++++-- src/trans/gpu/internal/trltom_mod.F90 | 4 ++-- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 21a9f740d..3aa9d5a5c 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -162,23 +162,20 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ENDDO IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& + CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) ELSEIF(IF_UV_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& + CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& & KPTRGP=IPTRGP,PGP=PGP) ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& + CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) ENDIF - !$ACC DATA COPYIN(FOUBUF_IN) - CALL LTDIR_CTL(IF_FS,IF_UV,IF_SCALARS, & + CALL LTDIR_CTL(IF_FS,IF_UV,IF_SCALARS,FOUBUF_IN,& & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) - !$ACC END DATA - ENDDO ELSE @@ -187,7 +184,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& !$ACC DATA CREATE(FOUBUF_IN) call nvtxStartRange("FTDIR") - CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,& + CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) @@ -199,7 +196,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& !$ACC DATA COPYOUT(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) !$ACC DATA COPYOUT(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) !$ACC DATA COPYOUT(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) - CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & + CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN, & & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) call nvtxEndRange diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index d57803341..224ec9b0a 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_CTL_MOD CONTAINS -SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & +SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & & KVSETUV,KVSETSC,KPTRGP,& & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -60,7 +60,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & USE TPM_GEN, only: nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : FOUBUF_IN USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT @@ -89,6 +88,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) +REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) ! Local variables REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 1c451b43e..3a2e9c420 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -92,7 +92,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) -REAL(KIND=JPRB), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) REAL(KIND=JPRBT), POINTER :: & diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index da7ac3973..9ee876785 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE LTDIR_CTL_MOD CONTAINS - SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & + SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& & PSPVOR,PSPDIV,PSPSCALAR, & & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -39,10 +39,11 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & ! ------------------------------------------------------------------ USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND_ECTRANS ,ONLY : JPRBT USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN + USE TPM_TRANS ,ONLY : FOUBUF USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F @@ -56,6 +57,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS, & IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS + REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF_IN(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index aedfa79c5..ba18c7271 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -74,7 +74,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(IN) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK @@ -214,7 +214,7 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(IN) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA From 8b60f835b240763d85dca85e31507133999655e1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:10 -0700 Subject: [PATCH 108/263] Re-allocate FOUBUF_IN in DIR_TRANS --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 7 ++++--- src/trans/gpu/internal/fourier_out_mod.F90 | 5 ++++- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 4 ++-- src/trans/gpu/internal/trltom_mod.F90 | 13 +++++++++++-- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 3aa9d5a5c..3a6cb21a1 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -73,9 +73,10 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! ------------------------------------------------------------------ USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND_ECTRANS ,ONLY : JPRBT USE TPM_GEN ,ONLY : NPROMATR -USE TPM_TRANS ,ONLY : FOUBUF_IN, NF_SC2, NF_SC3A, NF_SC3B +USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B !USE TPM_DISTR USE SHUFFLE_MOD ,ONLY : SHUFFLE @@ -124,6 +125,8 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB +REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) + ! ------------------------------------------------------------------ @@ -182,7 +185,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! No splitting of fields, transform done in one go call nvtxStartRange("DIRTRANS_nodata") - !$ACC DATA CREATE(FOUBUF_IN) call nvtxStartRange("FTDIR") CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& @@ -209,7 +211,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& !$ACC END DATA !$ACC END DATA !$ACC END DATA - !$ACC END DATA IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') ENDIF diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 8e697195b..47785924c 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -46,7 +46,7 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) IMPLICIT NONE REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF,KGL @@ -64,6 +64,9 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) IINC=-1 ENDIF +ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS)) +!$ACC ENTER DATA CREATE(FOUBUF_IN) + !$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL,D_NSTAGTF,G_NLOEN) ASYNC(1) ! scale results and move into next transformation buffer diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 224ec9b0a..f991cfa95 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -88,7 +88,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: FOUBUF_IN(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) ! Local variables REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 9ee876785..9ebb396cd 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -57,7 +57,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS - REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF_IN(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) @@ -69,7 +69,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& INTEGER(KIND=JPIM) :: JM,IM - !$ACC DATA PRESENT(FOUBUF_IN) CREATE(FOUBUF) + !$ACC DATA CREATE(FOUBUF) CALL GSTATS(153,0) #ifdef USE_CUDA_AWARE_MPI_FT diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index ba18c7271..a980d1ca2 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -74,7 +74,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(IN) :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK @@ -148,6 +148,12 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) CALL GSTATS(1607,1) ENDIF +IF (ALLOCATED(PFBUF_IN)) THEN + !$ACC EXIT DATA DELETE(PFBUF_IN) + DEALLOCATE(PFBUF_IN) +ENDIF + + IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ END SUBROUTINE TRLTOM_CUDAAWARE @@ -214,7 +220,7 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(IN) :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA @@ -252,6 +258,9 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) CALL GSTATS(1607,1) ENDIF +!$ACC EXIT DATA DELETE(PFBUF_IN) +DEALLOCATE(PFBUF_IN) + IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ END SUBROUTINE TRLTOM From 4328cbf78fcfbe655c5e6af2df138c20fe95a970 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:10 -0700 Subject: [PATCH 109/263] Reallocate FOUBUF in DIR_TRANS --- src/trans/gpu/internal/ledir_mod.F90 | 13 +++++++++---- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 9 +++------ src/trans/gpu/internal/ltdir_mod.F90 | 5 +++-- src/trans/gpu/internal/trltom_mod.F90 | 10 ++++++++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 4caabe680..858dd0167 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -10,7 +10,7 @@ MODULE LEDIR_MOD CONTAINS -SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) +SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !**** *LEDIR* - Direct Legendre transform. @@ -60,7 +60,6 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B -USE TPM_TRANS ,ONLY : FOUBUF USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -72,8 +71,9 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ! DUMMY ARGUMENTS +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV -REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) ! LOCAL VARIABLES INTEGER(KIND=JPIM) :: KM @@ -105,7 +105,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,POA1) & -!$ACC& PRESENT(FOUBUF,D_NPNTGTB1,D_NSTAGT1B,D_NPROCL) +!$ACC& PRESENT(D_NPNTGTB1,D_NSTAGT1B,D_NPROCL) ! TODO this doesn't make sense that we need it (???) !$ACC KERNELS @@ -113,6 +113,7 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ZINPA(:) = 0 !$ACC END KERNELS +!$ACC DATA PRESENT(FOUBUF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) DO KMLOC=1,D_NUMP DO JF=1,KF_FS*2 @@ -135,6 +136,10 @@ SUBROUTINE LEDIR(KF_FS,KF_UV,POA1) ENDDO ENDDO END DO +!$ACC END DATA + +!$ACC EXIT DATA DELETE(FOUBUF) +DEALLOCATE(FOUBUF) ! anti-symmetric diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 9ebb396cd..4440efafc 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -43,7 +43,6 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : FOUBUF USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F @@ -66,10 +65,10 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + + REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) INTEGER(KIND=JPIM) :: JM,IM - - !$ACC DATA CREATE(FOUBUF) CALL GSTATS(153,0) #ifdef USE_CUDA_AWARE_MPI_FT @@ -86,7 +85,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& CALL GSTATS(1645,0) IF (KF_FS > 0) THEN - CALL LTDIR(KF_FS,KF_UV,KF_SCALARS, & + CALL LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) @@ -94,8 +93,6 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& CALL GSTATS(1645,1) CALL GSTATS(103,1) - - !$ACC END DATA ! ----------------------------------------------------------------- diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index fbfc9258e..197706995 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -10,7 +10,7 @@ MODULE LTDIR_MOD CONTAINS - SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,& + SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -100,6 +100,7 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU @@ -126,7 +127,7 @@ SUBROUTINE LTDIR(KF_FS,KF_UV,KF_SCALARS,& ! -------------------- ! do the legendre transform - CALL LEDIR(KF_FS,KF_UV,ZOA1) + CALL LEDIR(FOUBUF,ZOA1,KF_FS,KF_UV) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index a980d1ca2..cb27871a7 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -73,7 +73,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) IMPLICIT NONE INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(OUT), ALLOCATABLE :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) @@ -89,6 +89,9 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) +ALLOCATE(PFBUF(D%NLENGT1B*2*KF_FS)) +!$ACC ENTER DATA CREATE(PFBUF) + IF(NPROC > 1) THEN DO J=1,NPRTRW ILENS(J) = D%NLTSGTB(J)*2*KF_FS @@ -219,7 +222,7 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) IMPLICIT NONE INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT), ALLOCATABLE :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) @@ -229,6 +232,9 @@ SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) +ALLOCATE(PFBUF(D%NLENGT1B*2*KF_FS)) +!$ACC ENTER DATA CREATE(PFBUF) + IF(NPROC > 1) THEN DO J=1,NPRTRW ILENS(J) = D%NLTSGTB(J)*2*KF_FS From 27bdcfdcd61541ebfa57196de47609e1bb56ed5b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:11 -0700 Subject: [PATCH 110/263] Reallocate POA1 in LEDIR --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 1 - src/trans/gpu/internal/ltdir_mod.F90 | 17 +++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 3a6cb21a1..fb11595fb 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -83,7 +83,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT USE LTDIR_CTL_MOD ,ONLY : LTDIR_CTL USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL -USE TPM_TRANS ,ONLY : ZGTF USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS use nvtx diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 197706995..5c20bd5dd 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -28,8 +28,6 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB - - USE TPM_FIELDS ,ONLY : ZOA1,ZOA2,ZEPSNM !**** *LTDIR* - Control of Direct Legendre transform step @@ -107,6 +105,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRB), ALLOCATABLE :: POA1(:,:,:) REAL(KIND=JPRB), ALLOCATABLE :: POA2(:,:,:) @@ -125,9 +124,12 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& !* 2. PREPARE WORK ARRAYS. ! -------------------- - + + ALLOCATE(POA1(2*KF_FS,R%NTMAX+3,D%NUMP)) + !$ACC ENTER DATA CREATE(POA1) + ! do the legendre transform - CALL LEDIR(FOUBUF,ZOA1,KF_FS,KF_UV) + CALL LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! ------------------------------------------------------------------ @@ -151,7 +153,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& ! Compute vorticity and divergence - CALL UVTVD(KF_UV,ZOA1(IUS:IUE,:,:),ZOA1(IVS:IVE,:,:),& + CALL UVTVD(KF_UV,POA1(IUS:IUE,:,:),POA1(IVS:IVE,:,:),& & POA2(IVORS:IVORE,:,:),POA2(IDIVS:IDIVE,:,:)) ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV @@ -176,11 +178,14 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& ! KM = D%MYMS(KMLOC) ! this is on the host, so need to cp from device, Nils - CALL UPDSP(KF_UV,KF_SCALARS,ZOA1,& + CALL UPDSP(KF_UV,KF_SCALARS,POA1,& & PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) + !$ACC EXIT DATA DELETE(POA1) + DEALLOCATE(POA1) + ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) From f77daa375cbbf39a1796603854cf4752681f783d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:11 -0700 Subject: [PATCH 111/263] Remove some allocations from setup_trans --- src/trans/gpu/external/setup_trans.F90 | 63 +++++++++----------------- src/trans/gpu/external/trans_end.F90 | 14 ++---- src/trans/gpu/internal/leinv_mod.F90 | 7 +++ src/trans/gpu/internal/prfi1b_mod.F90 | 5 +- src/trans/gpu/internal/tpm_dim.F90 | 2 +- src/trans/gpu/internal/uvtvd_mod.F90 | 2 +- 6 files changed, 35 insertions(+), 58 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index fe7159659..ce8d7bb78 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -104,11 +104,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_GEN ,ONLY : NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & & NMAX_RESOL, NPRINTLEV, LENABLED, NERR -USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL, R_NNOEXTZL +USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC,nprtrv, D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,mysetv,mysetw, MYPROC USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZIA,ZEPSNM,ZOA1,ZOA2, & +USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZEPSNM, & & ZAA,ZAS,TDZAA,TDZAS,& & IF_FS_INV0,IF_FS_DIR0,NFLEV0,ZAA0,& & ZAS0,KMLOC0 @@ -118,7 +118,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #endif USE TPM_FFTC ,ONLY : TC, FFTC_RESOL USE TPM_FLT -USE TPM_TRANS ,ONLY : FOUBUF_IN, FOUBUF, ZGTF, ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN +USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN USE TPM_CTL USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -172,7 +172,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& INTEGER(KIND=JPIM),PARAMETER :: IMAXFLD=240 INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL INTEGER(KIND=JPIM) :: NFLEVL, JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, i, J, IF_FS, IF_OUT_LT, IF_UV, IF_SCALARS -INTEGER(KIND=JPIM) :: IPPNUM, IF_PP, IF_FOUBUF +INTEGER(KIND=JPIM) :: IPPNUM, IF_PP LOGICAL :: LLP1,LLP2, LLSPSETUPONLY REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 @@ -559,51 +559,32 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ENDDO !$ACC ENTER DATA COPYIN(ZAA,ZAS) -IF_FOUBUF=MAX(IF_OUT_LT,IF_FS) -ALLOCATE(FOUBUF_IN(MAX(1,D%NLENGT0B*2*IF_FOUBUF))) -ALLOCATE(FOUBUF(MAX(1,D%NLENGT0B*2*IF_FOUBUF))) -! memory save - -ALLOCATE(ZGTF(2*IF_FS,D%NLENGTF)) -write(nout,*)'ZGTF :',size(ZGTF) -!$ACC ENTER DATA CREATE(ZGTF) - -ALLOCATE(ZIA(IF_FS_INV0,R%NLEI1,D%NUMP)) -ALLOCATE(ZOA1(4*IF_FS_DIR0,R%NLED4,D%NUMP)) -ALLOCATE(ZOA2(MAX(4*IF_UV,1),R%NLED4,D%NUMP)) ALLOCATE(ZEPSNM(d%nump,0:R%NTMAX+2)) -write(nout,*)'ZIA :',size(ZIA ) -write(nout,*)'ZOA1 :',size(ZOA1 ) -write(nout,*)'ZOA2 :',size(ZOA2 ) - +write(nout,*)'ZEPSNM :',size(ZEPSNM) ZEPSNM = 0._JPRBT CALL PREPSNM -ZGTF = 0._JPRBT -ZIA = 0._JPRBT -ZOA1 = 0._JPRBT -ZOA2 = 0._JPRBT -!$ACC ENTER DATA COPYIN(ZIA,ZEPSNM,ZOA1,ZOA2) +!$ACC ENTER DATA COPYIN(ZEPSNM) +! TODO: I guess tose might be needed again ! add arrays for GPNORM1 -ALLOCATE(ZAVE(IF_FS,R%NDGL)) -ALLOCATE(ZMINGL(IF_FS,R%NDGL)) -ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) -ALLOCATE(ZMINGPN(IF_FS)) -ALLOCATE(ZMAXGPN(IF_FS)) - -ZAVE = 0._JPRBT -ZMINGL = 0._JPRBT -ZMAXGL = 0._JPRBT -ZMINGPN = 0._JPRBT -ZMAXGPN = 0._JPRBT -!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) +!ALLOCATE(ZAVE(IF_FS,R%NDGL)) +!ALLOCATE(ZMINGL(IF_FS,R%NDGL)) +!ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) +!ALLOCATE(ZMINGPN(IF_FS)) +!ALLOCATE(ZMAXGPN(IF_FS)) + +!ZAVE = 0._JPRBT +!ZMINGL = 0._JPRBT +!ZMAXGL = 0._JPRBT +!ZMINGPN = 0._JPRBT +!ZMAXGPN = 0._JPRBT +!!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) !set up flat copies of constant data R_NSMAX=R%NSMAX R_NTMAX=R%NTMAX R_NDGNH=R%NDGNH R_NDGL=R%NDGL -R_NNOEXTZL=R%NNOEXTZL ALLOCATE(D_NSTAGT0B(SIZE(D%NSTAGT0B))) @@ -707,10 +688,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& F_RW(I)=F%RW(I) END DO -!$ACC ENTER DATA COPYIN(R_NSMAX,R_NTMAX,R_NDGL,R_NNOEXTZL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,& +!$ACC ENTER DATA COPYIN(D_NSTAGT0B,D_NSTAGT1B,& !$ACC& D_NPNTGTB1,D_NPROCL,D_NUMP,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& -!$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,& -!$ACC& G_NLOEN_MAX,F_RW) +!$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,& +!$ACC& F_RW) WRITE(NOUT,*) '===GPU arrays successfully allocated' !$ACC wait diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 5f7156c9d..20fdef985 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -46,7 +46,7 @@ SUBROUTINE TRANS_END(CDMODE) !ifndef INTERFACE USE TPM_GEN ,ONLY : MSETUP0, NCUR_RESOL, NMAX_RESOL, LENABLED,NDEF_RESOL -USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL, R_NNOEXTZL +USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL USE TPM_DISTR ,ONLY : D, DISTR_RESOL, NPRCIDS,D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX @@ -74,23 +74,15 @@ SUBROUTINE TRANS_END(CDMODE) IF (PRESENT(CDMODE)) CLMODE=CDMODE IF (CLMODE == 'FINAL') THEN - !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZIA,ZEPSNM,ZOA1,ZOA2,ZAA,ZAS,ZGTF) + !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZEPSNM,ZAA,ZAS) DEALLOCATE(ZAA0) DEALLOCATE(ZAS0) - DEALLOCATE(ZIA) DEALLOCATE(ZEPSNM) - DEALLOCATE(ZOA1) - DEALLOCATE(ZOA2) DEALLOCATE(ZAA) DEALLOCATE(ZAS) - DEALLOCATE(ZGTF) - - !memory save - DEALLOCATE(FOUBUF_IN) - DEALLOCATE(FOUBUF) DEALLOCATE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_MYMS,D_NPROCL,D_NASM0,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,F_RW) - !$ACC exit data delete(R_NSMAX,R_NTMAX,R_NDGL,R_NNOEXTZL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL,D_NUMP,D_MYMS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) + !$ACC EXIT DATA DELETE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL,D_MYMS,G_NDGLU,G_NMEN,G_NLOEN,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) !call CUDA_DGEMM_BATCHED_FINALIZE() diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 5e401ef0c..3f206a994 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -107,6 +107,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) +! TODO this doesn't make sense that we need it (???) +!$ACC KERNELS +ZINP(:) = 0 +ZOUTS(:) = 0 +ZOUTA(:) = 0 +!$ACC END KERNELS + ! READ 2:NSMAX+3 !IF KM=0 and NSMAX is 6: diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 06ef39d52..fdd141a99 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -81,10 +81,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. ! -------------------------------------------------- - !$ACC DATA & - !$ACC PRESENT(D_NUMP,R_NSMAX,D_MYMS,D_NASM0) & - !$ACC PRESENT(PIA) & - !$ACC PRESENT(PSPEC) + !$ACC DATA PRESENT(D_MYMS,D_NASM0,PIA,PSPEC) !$ACC DATA IF(PRESENT(KFLDPTR)) PRESENT(KFLDPTR) diff --git a/src/trans/gpu/internal/tpm_dim.F90 b/src/trans/gpu/internal/tpm_dim.F90 index 9cc76b6bc..162c20395 100755 --- a/src/trans/gpu/internal/tpm_dim.F90 +++ b/src/trans/gpu/internal/tpm_dim.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -52,6 +53,5 @@ MODULE TPM_DIM INTEGER(KIND=JPIM) :: R_NTMAX ! Truncation order for tendencies INTEGER(KIND=JPIM) :: R_NDGNH ! Number of rows in northern hemisphere INTEGER(KIND=JPIM) :: R_NDGL ! Number of rows of latitudes -INTEGER(KIND=JPIM) :: R_NNOEXTZL ! Longitude direction END MODULE TPM_DIM diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index bd0aac9a0..2cffd6ba0 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -86,7 +86,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) ! ------------------------------------------ !$ACC DATA& -!$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & +!$ACC& PRESENT(D_MYMS) & !$ACC& PRESENT(F,F%RN,F%NLTN) & !$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) From fe1cee4c528839c8ae621d32785ca45bee22804e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:11 -0700 Subject: [PATCH 112/263] Remove redundand variables from fields and dir files --- src/trans/gpu/external/dir_trans.F90 | 19 --- src/trans/gpu/external/inv_trans.F90 | 7 -- src/trans/gpu/external/setup_trans.F90 | 62 +-------- src/trans/gpu/external/trans_end.F90 | 2 +- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 2 +- src/trans/gpu/internal/tpm_fields.F90 | 10 -- src/trans/gpu/internal/updspb_vd_mod.F90 | 154 ----------------------- 7 files changed, 5 insertions(+), 251 deletions(-) delete mode 100755 src/trans/gpu/internal/updspb_vd_mod.F90 diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 index 7c4939467..d78b5d86d 100755 --- a/src/trans/gpu/external/dir_trans.F90 +++ b/src/trans/gpu/external/dir_trans.F90 @@ -116,7 +116,6 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& & NF_SC2, NF_SC3A, NF_SC3B, & & NGPBLKS, NPROMA USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV -USE TPM_FIELDS ,ONLY : IF_FS_DIR,IF_FS_DIR0,NFLEV,NFLEV0 USE TPM_FLT, ONLY: S USE TPM_GEOMETRY ,ONLY : G USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -309,26 +308,8 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& NGPBLKS = (D%NGPTOT-1)/NPROMA+1 IF_FS = 2*IF_UV + IF_SCALARS -!D%IADJUST_D=0 -!IF(MOD(IF_FS,2)==1) THEN -! IF_FS = IF_FS + 1 -! D%IADJUST_D=1 -!ENDIF - IF_GP = 2*IF_UV_G+IF_SCALARS_G -! add additional post-processing requirements -! (copied from setup_trans.F90. Or does this need to be different here than in setup_trans.F90?) -!IF_PP = 2*NFLEV -!IF_PP = 0 - -! How do I get the current number of levels? For now I use: (Andreas) -!NFLEV = NFLEV0 - -! set currently used array sizes for the GPU arrays: -IF_FS_DIR=2*IF_FS+2!2*(2*IF_UV+NFLEV+2+IF_PP) -print*,"dir_trans: IF_FS_DIR=",IF_FS_DIR," IF_FS_DIR0=",IF_FS_DIR0 - ! Consistency checks IF (IF_UV > 0) THEN diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index aa0747ce8..e495554de 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -132,7 +132,6 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, LATLON, & & NF_SC2, NF_SC3A, NF_SC3B, NGPBLKS, NPROMA -USE TPM_FIELDS ,ONLY : IF_FS_INV,IF_FS_INV0 USE TPM_FLT, ONLY: S USE TPM_GEOMETRY ,ONLY : G !USE TPM_GEOMETRY @@ -415,12 +414,6 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF_UV_PAR = IF_UV_PAR+2 ENDIF -! set currently used array sizes for the GPU arrays: -!IF_FS_INV= 8*IF_UV + 2*IF_SCALARS + 2*IF_SCDERS -!Andreas: we were using the previous line in setup_trans but this doesn't consider derivatives. Better: -IF_FS_INV=2*IF_OUT_LT -print*,"inv_trans: IF_FS_INV=",IF_FS_INV," IF_FS_INV0=",IF_FS_INV0 - ! Consistency checks IF (IF_UV > 0) THEN diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index ce8d7bb78..4b20bacb2 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -110,7 +110,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZEPSNM, & & ZAA,ZAS,TDZAA,TDZAS,& -& IF_FS_INV0,IF_FS_DIR0,NFLEV0,ZAA0,& +& ZAA0,& & ZAS0,KMLOC0 USE TPM_FFT ,ONLY : T, FFT_RESOL #ifdef WITH_FFTW @@ -171,8 +171,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Local variables INTEGER(KIND=JPIM),PARAMETER :: IMAXFLD=240 INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL -INTEGER(KIND=JPIM) :: NFLEVL, JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, i, J, IF_FS, IF_OUT_LT, IF_UV, IF_SCALARS -INTEGER(KIND=JPIM) :: IPPNUM, IF_PP +INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, i, J LOGICAL :: LLP1,LLP2, LLSPSETUPONLY REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 @@ -444,19 +443,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF( .NOT.D%LGRIDONLY ) THEN -!allocating arrays for the GPU: -IF(PRESENT(KFLEV)) THEN - NFLEV0 = KFLEV -! NFLEVL = NFLEV0/NPRTRV -ELSE - NFLEV0 = ceiling(REAL(IMAXFLD)/NPRTRV) -ENDIF - -! need to get local rank to be able to set device (1GPU == 1 MPI-rank) -!ilocal_rank = 0 -!call GETENV("OMPI_COMM_WORLD_LOCAL_RANK",comm_local_rank) -!read(comm_local_rank,'(I2)') ilocal_rank - iunit=300+myproc #ifdef _OPENACC @@ -470,48 +456,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& WRITE(iunit,*) '===now going to allocate GPU arrays on processor: ', myproc, ' device = ', mygpu, ' ',idev, ' of ', inumdevs #endif -!dimensions of matrices for Legendre Transforms for RAPS ? -!IF_OUT_LT = 5*NFLEV0+2 -!IF_FS = 6*NFLEV0+3 - -! add additional post-processing requirements -!IF_PP = 2*NFLEV0 -IF_PP = 0 - -! u/v + scalars 3d + scalars 2d -IF_UV = NFLEV0 -! SCALARS INCLUDING DERIVATIVES -IF_SCALARS = NFLEV0 + 2*NFLEV0 + 1 + 2 + IF_PP -IF_OUT_LT = 4*IF_UV+3*NFLEV0+3+IF_PP -!IF_OUT_LT = 4*IF_UV+3*NFLEV0+3 -!8*KF_UV+2*KF_SCALARS -!ILEI2 = 8*KF_UV + 2*KF_SCALARS + 2*KF_SCDERS -IF_FS_INV0=8*IF_UV+2*IF_SCALARS - -! fields in Fourier space for inv trans the same -!IF_FS=4*IF_UV+1*NFLEV0+2 -IF_FS=4*IF_UV+1*NFLEV0+2 -! for derivatives u/v add -!IF_FS=IFS_FS+2*(2*NFLEV0) -! for each 3d scalar derivative add -IF_FS=IF_FS+2*NFLEV0 ! temperature -! for each 2d scalar derivative add -IF_FS=IF_FS+2 ! sfc pressure -IF_FS=IF_FS+IF_PP - -! u/v + scalars for direct transforms -! plus postprocessing buffer -!ippnum=NFLEV0 -IF_FS_DIR0=2*(2*IF_UV+NFLEV0+2+IF_PP) -!QUESTION: Why do we have NFLEV0 here? (Andreas) - -! fields in Fourier space for dir trans -!IF_FS = 2*IF_UV + IF_SCALARS -! plus add 2*scalar_derivatives + add vorg/divg + 2*IF_UV for u/v zonal derivatives - -write(nout,*)'setup_trans: if_uv=',if_uv,' if_out_lt=',if_out_lt,' IF_FS_DIR0=',IF_FS_DIR0,'IF_FS_INV0= ',IF_FS_INV0 -IF(MOD(IF_FS,2)==1) IF_FS = IF_FS + 1 - !leading and trailing dimensions of A for symmetric and antisymmetric cases ! (same for ltinv and ltdir) TDZAA=(R%NTMAX+2)/2 @@ -521,7 +465,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !$ACC ENTER DATA & !$ACC& COPYIN(F,F%RN,F%RLAPIN,S,S%FA,S%ITHRESHOLD,S%LUSEFLT,D,D%NUMP,D%MYMS,R,R%NDGNH,R%NSMAX,G,G%NDGLU) & -!$ACC& copyin(D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,G%NMEN,D%NPROCM,D%NPTRLS,G,G%NLOEN,D%MSTABF) +!$ACC& COPYIN(D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,G%NMEN,D%NPROCM,D%NPTRLS,G,G%NLOEN,D%MSTABF) ! Initialize A arrays diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 20fdef985..590a850ef 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -50,7 +50,7 @@ SUBROUTINE TRANS_END(CDMODE) USE TPM_DISTR ,ONLY : D, DISTR_RESOL, NPRCIDS,D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL,F_RW, ZIA,ZEPSNM,ZOA1,ZOA2,ZAA,ZAS,ZAA0,ZAS0 +USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL,F_RW, ZEPSNM,ZAA,ZAS,ZAA0,ZAS0 USE TPM_FFT ,ONLY : T, FFT_RESOL USE TPM_CTL ,ONLY : C, CTL_RESOL #ifdef WITH_FFTW diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 4440efafc..3bb763b46 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -51,7 +51,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& USE LTDIR_MOD ,ONLY : LTDIR USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE - USE TPM_FIELDS ,ONLY : ZOA1,ZEPSNM + USE TPM_FIELDS ,ONLY : ZEPSNM IMPLICIT NONE diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index 59f14bab3..affaf7248 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -50,16 +50,6 @@ MODULE TPM_FIELDS INTEGER(KIND=JPIM) :: TDZAA INTEGER(KIND=JPIM) :: TDZAS -! enable calling setup_trans with a different set of fields than inv_trans and dir_trans: -! IF_FS_INV0: size used for the allocation in setup_trans -! IF_FS_INV: size used in inv_trans and dir_Trans, needs to be <= IF_FS_INV0 -INTEGER(KIND=JPIM) :: IF_FS_INV, IF_FS_INV0 -INTEGER(KIND=JPIM) :: IF_FS_DIR, IF_FS_DIR0 -INTEGER(KIND=JPIM) :: NFLEV, NFLEV0 - -REAL(KIND=JPRB),ALLOCATABLE, TARGET :: ZIA(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) -REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA1(:,:,:) -REAL(KIND=JPRBT),ALLOCATABLE, TARGET :: ZOA2(:,:,:) END MODULE TPM_FIELDS diff --git a/src/trans/gpu/internal/updspb_vd_mod.F90 b/src/trans/gpu/internal/updspb_vd_mod.F90 deleted file mode 100755 index 400990a11..000000000 --- a/src/trans/gpu/internal/updspb_vd_mod.F90 +++ /dev/null @@ -1,154 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE UPDSPB_VD_MOD -CONTAINS -SUBROUTINE UPDSPB_VD(KFIELD,PSPVOR,PSPDIV,KFLDPTR) - - !**** *UPDSPB* - Update spectral arrays after direct Legendre transform - - ! Purpose. - ! -------- - ! To update spectral arrays for a fixed zonal wave-number - ! from values in POA. - - !** Interface. - ! ---------- - ! CALL UPDSPB(....) - - ! Explicit arguments : KM - zonal wavenumber - ! -------------------- KFIELD - number of fields - ! POA - work array - ! PSPEC - spectral array - - ! Implicit arguments : None - ! -------------------- - - ! Method. - ! ------- - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! Mats Hamrud and Philippe Courtier *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 88-02-02 - ! D. Giard : 93-03-19 truncations NSMAX and NTMAX (see NOTE) - ! R. El Khatib : 94-08-02 Replace number of fields by indexes of the - ! first and last field - ! L. Isaksen : 95-06-06 Reordering of spectral arrays - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - - - - USE TPM_DIM ,ONLY : R,R_NSMAX,R_NTMAX - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NASM0 - USE TPM_FIELDS ,ONLY : ZOA2 - ! - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD - INTEGER(KIND=JPIM) :: KM,KMLOC - REAL(KIND=JPRB) ,INTENT(OUT) :: PSPVOR(:,:) - REAL(KIND=JPRB) ,INTENT(OUT) :: PSPDIV(:,:) - INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) - - ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: II, INM, IR, JFLD, JN, ISMAX, ITMAX, IASM0,IFLD - INTEGER(KIND=JPIM) :: IVORS, IDIVS - - - ! ------------------------------------------------------------------ - - !* 0. NOTE. - ! ----- - - ! The following transfer reads : - ! SPEC(k,NASM0(m)+NLTN(n)*2) =POA(nn,2*k-1) (real part) - ! SPEC(k,NASM0(m)+NLTN(n)*2+1)=POA(nn,2*k ) (imaginary part) - ! with n from m to NSMAX - ! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. - ! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) - ! nn is the loop index. - - IVORS = 1 - IDIVS = 2*KFIELD+1 - - !* 1. UPDATE SPECTRAL FIELDS. - ! ----------------------- - !$ACC data & - !$ACC& present(ZOA2) & - !$ACC& copy(PSPVOR,PSPDIV) & - !$ACC& copy(D,D_NUMP,D_MYMS,R,R_NSMAX,R_NTMAX,D,D_NASM0) - - !$ACC parallel loop collapse(3) private(KM,INM,IR,II,IASM0,IFLD) - DO KMLOC=1,D_NUMP - DO JN=R_NTMAX+2-R_NSMAX,R_NTMAX+2 - DO JFLD=1,KFIELD - - KM = D_MYMS(KMLOC) - IASM0 = D_NASM0(KM) - - IF(KM == 0) THEN - - if (JN .le. R_NTMAX+2-KM) then - INM = IASM0+(R_NTMAX+2-JN)*2 - IR = 2*JFLD-1 - PSPVOR(JFLD,INM) = ZOA2(IVORS+IR-1,JN,KMLOC) - PSPDIV(JFLD,INM) = ZOA2(IDIVS+IR-1,JN,KMLOC) - PSPVOR(JFLD,INM+1) = 0.0_JPRBT - PSPDIV(JFLD,INM+1) = 0.0_JPRBT - end if - IF(PRESENT(KFLDPTR)) THEN - IFLD = KFLDPTR(JFLD) - PSPVOR(IFLD,IASM0) = 0.0_JPRBT - PSPDIV(IFLD,IASM0) = 0.0_JPRBT - ELSE - PSPVOR(JFLD,IASM0) = 0.0_JPRBT - PSPDIV(JFLD,IASM0) = 0.0_JPRBT - ENDIF - - ELSE - - - if (JN .le. R_NTMAX+2-KM) then - INM = IASM0+((R_NTMAX+2-JN)-KM)*2 - - IR = 2*JFLD-1 - II = IR+1 - PSPVOR(JFLD,INM) = ZOA2(IVORS+IR-1,JN,KMLOC) - PSPVOR(JFLD,INM+1) = ZOA2(IVORS+II-1,JN,KMLOC) - PSPDIV(JFLD,INM) = ZOA2(IDIVS+IR-1,JN,KMLOC) - PSPDIV(JFLD,INM+1) = ZOA2(IDIVS+II-1,JN,KMLOC) - - end if - end if - - ENDDO - - ENDDO - !end loop over wavenumber - END DO - !$ACC end data - - ! ------------------------------------------------------------------ - - END SUBROUTINE UPDSPB_VD - END MODULE UPDSPB_VD_MOD From 57a5e606399898dfb2b600dba37cc23439e83263 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:11 -0700 Subject: [PATCH 113/263] No more need to compute divergence if vorticity is needed This was needed in an earlier version! --- src/trans/gpu/external/inv_trans.F90 | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index e495554de..ed08bac69 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -371,8 +371,6 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& ! Compute derived variables -IF(LVORGP) LDIVGP = .TRUE. - NGPBLKS = (D%NGPTOT-1)/NPROMA+1 IF_OUT_LT = 2*IF_UV + IF_SCALARS+IF_SCDERS From 2a4a6fd48b0b1e8aa1d5eea19bcc7b16a18a9475 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:12 -0700 Subject: [PATCH 114/263] Remove redundant variables --- src/trans/gpu/external/inv_trans.F90 | 6 ------ src/trans/gpu/internal/tpm_distr.F90 | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index ed08bac69..ca2c127ea 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -385,12 +385,6 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF(IF_UV > 0 .AND. LUVDER) THEN IF_FS = IF_FS+2*IF_UV ENDIF -! importance related to cuFFT -D%IADJUST_I=0 -!IF(MOD(IF_FS,2)==1) THEN -! IF_FS = IF_FS + 1 -! D%IADJUST_I=1 -!ENDIF IF_GP = 2*IF_UV_G+IF_SCALARS_G IF(LSCDERS) THEN diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 index c196c815f..8901edf7c 100755 --- a/src/trans/gpu/internal/tpm_distr.F90 +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -159,9 +160,6 @@ MODULE TPM_DISTR REAL(KIND=JPRBT) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIM) :: IADJUST_D -INTEGER(KIND=JPIM) :: IADJUST_I - END TYPE DISTR_TYPE !flat versions of the above From 7887b0d9e916fc321311366835d34b01a374223d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:12 -0700 Subject: [PATCH 115/263] Use pointers for clarity --- src/trans/gpu/internal/ltdir_mod.F90 | 52 +++++++++------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 5c20bd5dd..77fe24de5 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -101,15 +101,12 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU - INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE + INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPRB), ALLOCATABLE :: POA1(:,:,:) - REAL(KIND=JPRB), ALLOCATABLE :: POA2(:,:,:) - - - !call cudaProfilerStart + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA1(:,:,:) + REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA2(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) @@ -137,28 +134,26 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& ! --------------------------------- IF( KF_UV > 0 ) THEN - !!CALL PREPSNM - - IUS = 1 - IUE = 2*KF_UV - IVS = 2*KF_UV+1 - IVE = 4*KF_UV - IVORS = 1 - IVORE = 2*KF_UV - IDIVS = 2*KF_UV+1 - IDIVE = 4*KF_UV - ALLOCATE(POA2(4*KF_UV,R%NTMAX+3,D%NUMP)) !$ACC ENTER DATA CREATE(POA2) + ! U and V are in POA1 + IFIRST = 0 + PU => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PV => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + ! Compute VOR and DIV ino POA2 + IFIRST = 0 + PVOR => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PDIV => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) ! Compute vorticity and divergence - CALL UVTVD(KF_UV,POA1(IUS:IUE,:,:),POA1(IVS:IVE,:,:),& - & POA2(IVORS:IVORE,:,:),POA2(IDIVS:IDIVE,:,:)) + CALL UVTVD(KF_UV,PU,PV,PVOR,PDIV) ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV - CALL UPDSPB(KF_UV,POA2(IVORS:IVORE,:,:),PSPVOR,KFLDPTRUV) - CALL UPDSPB(KF_UV,POA2(IDIVS:IDIVE,:,:),PSPDIV,KFLDPTRUV) + CALL UPDSPB(KF_UV,PVOR,PSPVOR,KFLDPTRUV) + CALL UPDSPB(KF_UV,PDIV,PSPDIV,KFLDPTRUV) !$ACC EXIT DATA DELETE(POA2) @@ -169,14 +164,6 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& !* 6. UPDATE SPECTRAL ARRAYS. ! ----------------------- - !end loop over wavenumber - - !END DO - - !loop over wavenumber - !DO KMLOC=1,D%NUMP - ! KM = D%MYMS(KMLOC) - ! this is on the host, so need to cp from device, Nils CALL UPDSP(KF_UV,KF_SCALARS,POA1,& & PSPSCALAR,& @@ -189,11 +176,6 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) - - !end loop over wavenumber - !END DO - - !call cudaProfilerStop END SUBROUTINE LTDIR END MODULE LTDIR_MOD From 3b66b49ed45480747d31fb541b578391b358d860 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:12 -0700 Subject: [PATCH 116/263] Accidentally added to many FFTs again --- src/trans/gpu/internal/ftdir_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 1855fa6bd..5e42e38ec 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE FTDIR(PREEL,KFIELD) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) + CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD/2,KFIELD) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 7510bc79b..36b91833c 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD/2,KFIELD) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA From 6445bc1f30b8a18871bb0b92a3bdbd9e942b8312 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:12 -0700 Subject: [PATCH 117/263] Interface changes between complex/non-complex field counts --- src/trans/gpu/internal/fourier_in_mod.F90 | 4 ++-- src/trans/gpu/internal/fourier_out_mod.F90 | 7 ++++--- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 5 +++-- src/trans/gpu/internal/ftdir_mod.F90 | 5 ++++- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 7 ++++--- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index a51d2631d..d059cda77 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -74,14 +74,14 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS/2 + DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 PREEL(2*JF-1,2*JM+IOFF) = FOUBUF(ISTA+2*JF-1) PREEL(2*JF, 2*JM+IOFF) = FOUBUF(ISTA+2*JF ) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 47785924c..6983f32dc 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -64,7 +64,7 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) IINC=-1 ENDIF -ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS)) +ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS*2)) !$ACC ENTER DATA CREATE(FOUBUF_IN) !$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL,D_NSTAGTF,G_NLOEN) ASYNC(1) @@ -74,7 +74,7 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS/2 + DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 IOFF = D_NSTAGTF(KGL)+1 @@ -82,8 +82,9 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 + ! This is not contiguous in PREEL due to the memory layout. FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL(2*JF-1, 2*JM+IOFF) FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL(2*JF , 2*JM+IOFF) ENDDO diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index f991cfa95..e22ffa626 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -148,6 +148,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF +! Note that this buffer is 2X too large, we will need to transpose ZGTF to get rid of this ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) !$ACC ENTER DATA CREATE(ZGTF) @@ -188,8 +189,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF,2*KF_FS) - CALL FOURIER_OUT(ZGTF,FOUBUF_IN,2*KF_FS) + CALL FTDIR(ZGTF,KF_FS) + CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) ENDIF CALL GSTATS(1640,1) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 5e42e38ec..3dd640969 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -88,10 +88,13 @@ SUBROUTINE FTDIR(PREEL,KFIELD) DO KGL=IBEG,IEND,IINC + ! NSTAGTF gives us space for NLOEN+3 elements + ! In reality, at this point we need space for at most NLOEN+2 elements + ! (in case NLOEN is even, otherwise NLOEN+1, due to the R2C definition) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD/2,KFIELD) + CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD*2) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 3a2e9c420..e2c9cfeef 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -127,6 +127,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST +! Note that this buffer is 2X too large, we will need to transpose ZGTF to get rid of this ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) !$ACC ENTER DATA CREATE(ZGTF) @@ -151,8 +152,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF -! from FOUBUF to ZGTF -CALL FOURIER_IN(FOUBUF,ZGTF,KF_INPUT) +! from FOUBUF to ZGTF. Divide by two because we move into complex space now +CALL FOURIER_IN(FOUBUF,ZGTF,KF_INPUT/2) ! 2. Fourier space computations @@ -162,7 +163,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ! 3. Fourier transform ! inplace operation IF(KF_FS > 0) THEN - CALL FTINV(ZGTF,2*KF_FS) + CALL FTINV(ZGTF,KF_FS) ENDIF CALL GSTATS(1639,1) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 36b91833c..85129a1a3 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) IOFF=D%NSTAGTF(KGL)+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD/2,KFIELD) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD*2) !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA From 40d75960ef37de4890bf6f4a16312e9d830d9d76 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:12 -0700 Subject: [PATCH 118/263] Tiny cleanup in modules --- src/trans/gpu/internal/fsc_mod.F90 | 6 +++--- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 2 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 1 - src/trans/gpu/internal/trgtol_mod.F90 | 6 +++--- src/trans/gpu/internal/trltog_mod.F90 | 3 ++- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 8fd50706e..43f090a0d 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -46,13 +46,13 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA ! ------------------------------------------------------------------ USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_TRANS ,ONLY : LUVDER, LATLON, LVORGP, LDIVGP USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF USE TPM_GEOMETRY ,ONLY : G, G_NMEN USE TPM_FIELDS ,ONLY : F USE TPM_FLT ,ONLY: S -use tpm_gen, only: nout +USE TPM_GEN, ONLY: NOUT + +USE TPM_TRANS ,ONLY : LATLON ! IMPLICIT NONE diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index e22ffa626..c72dbda49 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -170,7 +170,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL MPL_BARRIER(CDSTRING='') ENDIF CALL GSTATS(430,1) -CALL TRGTOL_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRGTOL_CUDAAWARE(ZGTF,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !$ACC END DATA !$ACC END DATA diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 3315e6ac7..00ceb3c6f 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -88,7 +88,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TPM_GEN ,ONLY : NPROMATR USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP -!USE TPM_DISTR USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 0b9ba6ffb..f538933af 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -10,7 +10,7 @@ MODULE TRGTOL_MOD CONTAINS - SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& + SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *TRGTOL * - transposition of grid point data from column @@ -77,18 +77,18 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& USE TPM_GEN ,ONLY : LSYNC_TRANS USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV - USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS, NPROMA USE PE2SET_MOD ,ONLY : PE2SET USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML USE OML_MOD ,ONLY : OML_MY_THREAD USE MPI + USE TPM_TRANS ,ONLY : NPROMA + IMPLICIT NONE REAL(KIND=JPRBT),INTENT(OUT) :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP - INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGPUV(:,:,:,:) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 70f7b540b..491200107 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -78,13 +78,14 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, USE TPM_GEN ,ONLY : LSYNC_TRANS USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV - USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS, NPROMA USE PE2SET_MOD ,ONLY : PE2SET USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML USE OML_MOD ,ONLY : OML_MY_THREAD USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE MPI + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA + IMPLICIT NONE REAL(KIND=JPRBT), INTENT(IN) :: PGLAT(:,:) From 9d2e2b36e548946cb246365baddeb9bea8be9022 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:13 -0700 Subject: [PATCH 119/263] Put copyins and copyouts at the same place for INV and DIR --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 32 ++------------------ src/trans/gpu/internal/ftinv_ctl_mod.F90 | 10 ++++++ src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 8 ----- src/trans/gpu/internal/ltdir_mod.F90 | 23 ++++++++++++++ src/trans/gpu/internal/ltinv_mod.F90 | 8 ++--- src/trans/gpu/internal/trltog_mod.F90 | 10 +++--- src/trans/gpu/internal/updsp_mod.F90 | 6 ++-- 7 files changed, 48 insertions(+), 49 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index fb11595fb..795e5cf40 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -182,40 +182,14 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ELSE ! No splitting of fields, transform done in one go - call nvtxStartRange("DIRTRANS_nodata") - - call nvtxStartRange("FTDIR") CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - call nvtxEndRange - - call nvtxStartRange("LTDIR") - !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) - !$ACC DATA COPYOUT(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) - !$ACC DATA COPYOUT(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) - !$ACC DATA COPYOUT(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) - !$ACC DATA COPYOUT(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) - CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN, & - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) - call nvtxEndRange -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='') -ENDIF - CALL GSTATS(430,0) - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='') -ENDIF - CALL GSTATS(430,1) - call nvtxEndRange + CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN, & + & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& + & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) ENDIF ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index e2c9cfeef..daaf5ac85 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -249,8 +249,18 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' +!$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) +!$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) +!$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) +!$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) +!$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) +!$ACC END DATA +!$ACC END DATA +!$ACC END DATA +!$ACC END DATA +!$ACC END DATA #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 00ceb3c6f..26dd26573 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -282,25 +282,17 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ENDDO ELSE - call nvtxStartRange("INVTRANS") - ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF - call nvtxStartRange("LTINV") CALL LTINV_CTL(KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,KFIELD) - call nvtxEndRange ! from FOUBUF to PGPXXX - call nvtxStartRange("FTINV") CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& & KF_UV,KF_SCALARS,KF_GP,& & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - call nvtxEndRange - - call nvtxEndRange ENDIF diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 77fe24de5..c80ff373b 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -28,6 +28,9 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B !**** *LTDIR* - Control of Direct Legendre transform step @@ -128,6 +131,12 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& ! do the legendre transform CALL LEDIR(FOUBUF,POA1,KF_FS,KF_UV) + !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYOUT(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYOUT(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYOUT(PSPSC3B) IF(NF_SC3B > 0) + ! ------------------------------------------------------------------ !* 5. COMPUTE VORTICITY AND DIVERGENCE. @@ -173,6 +182,20 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& !$ACC EXIT DATA DELETE(POA1) DEALLOCATE(POA1) + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') + ENDIF + CALL GSTATS(430,0) + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + IF (LSYNC_TRANS) THEN + CALL MPL_BARRIER(CDSTRING='') + ENDIF + CALL GSTATS(430,1) + ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 8dfa193bb..0fca5d2dc 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -125,10 +125,10 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& ENDIF CALL GSTATS(431,0) !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) - !$ACC DATA COPYIN(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) - !$ACC DATA COPYIN(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) - !$ACC DATA COPYIN(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) - !$ACC DATA COPYIN(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) + !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') ENDIF diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 491200107..735d24234 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -341,11 +341,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDDO !$ACC DATA PRESENT(PGLAT) COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) CALL GSTATS(1806,1) diff --git a/src/trans/gpu/internal/updsp_mod.F90 b/src/trans/gpu/internal/updsp_mod.F90 index 388f0c139..df4f18227 100755 --- a/src/trans/gpu/internal/updsp_mod.F90 +++ b/src/trans/gpu/internal/updsp_mod.F90 @@ -94,9 +94,9 @@ SUBROUTINE UPDSP(KF_UV,KF_SCALARS,POA1, & !* 1.1 VORTICITY AND DIVERGENCE. !$ACC DATA PRESENT(PSPSCALAR) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSCALAR)) -!$ACC DATA PRESENT(PSPSC2) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC2) .AND. NF_SC2 > 0) -!$ACC DATA PRESENT(PSPSC3A) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3A) .AND. NF_SC3A > 0) -!$ACC DATA PRESENT(PSPSC3B) IF(KF_SCALARS > 0 .AND. PRESENT(PSPSC3B) .AND. NF_SC3B > 0) +!$ACC DATA PRESENT(PSPSC2) IF(NF_SC2 > 0) +!$ACC DATA PRESENT(PSPSC3A) IF(NF_SC3A > 0) +!$ACC DATA PRESENT(PSPSC3B) IF(NF_SC3B > 0) IST = 1 IST = IST+4*KF_UV From a571122c7cc5d4446b9075fa3df9fd291af25ef7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:13 -0700 Subject: [PATCH 120/263] Refactor 4XX GSTATS (NVIDIA GSTATS) --- src/trans/gpu/external/dir_trans.F90 | 6 ++-- src/trans/gpu/external/inv_trans.F90 | 6 ++-- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 8 +++-- src/trans/gpu/internal/ftdir_mod.F90 | 13 +++++--- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 14 ++++++++ src/trans/gpu/internal/ftinv_mod.F90 | 13 +++++--- src/trans/gpu/internal/gstats_label_ifs.F90 | 37 ++++++++++++--------- src/trans/gpu/internal/ledir_mod.F90 | 33 +++++++++++++----- src/trans/gpu/internal/leinv_mod.F90 | 22 ++++++------ src/trans/gpu/internal/ltdir_mod.F90 | 8 +++-- src/trans/gpu/internal/ltinv_mod.F90 | 8 +++-- src/trans/gpu/internal/trgtol_mod.F90 | 17 +++++----- src/trans/gpu/internal/trltog_mod.F90 | 18 +++++----- src/trans/gpu/internal/trltom_mod.F90 | 18 +++++----- src/trans/gpu/internal/trmtol_mod.F90 | 22 ++++++------ 15 files changed, 151 insertions(+), 92 deletions(-) diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 index d78b5d86d..63ba412f3 100755 --- a/src/trans/gpu/external/dir_trans.F90 +++ b/src/trans/gpu/external/dir_trans.F90 @@ -166,7 +166,7 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') ENDIF -CALL GSTATS(440,0) +CALL GSTATS(410,0) CALL GSTATS(1808,0) ! Set current resolution CALL SET_RESOL(KRESOL) @@ -510,9 +510,11 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF (LHOOK) CALL DR_HOOK('DIR_TRANS',1,ZHOOK_HANDLE) IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF -CALL GSTATS(440,1) +CALL GSTATS(410,1) ! ------------------------------------------------------------------ !endif INTERFACE diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index ca2c127ea..3970b14a5 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -201,7 +201,7 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF (LSYNC_TRANS) THEN CALL MPL_BARRIER(CDSTRING='') ENDIF -CALL GSTATS(441,0) +CALL GSTATS(420,0) CALL GSTATS(1807,0) ! Set current resolution CALL SET_RESOL(KRESOL) @@ -630,9 +630,11 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& IF (LHOOK) CALL DR_HOOK('INV_TRANS',1,ZHOOK_HANDLE) IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF -CALL GSTATS(441,1) +CALL GSTATS(420,1) ! ------------------------------------------------------------------ !endif INTERFACE diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index c72dbda49..bc579ba81 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -158,18 +158,22 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & #ifdef USE_CUDA_AWARE_MPI_FT IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF -CALL GSTATS(430,0) +CALL GSTATS(412,0) !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) IF (LSYNC_TRANS) THEN + CALL GSTATS(432,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) ENDIF -CALL GSTATS(430,1) +CALL GSTATS(412,1) CALL TRGTOL_CUDAAWARE(ZGTF,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !$ACC END DATA diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 3dd640969..24dae2042 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -82,9 +82,11 @@ SUBROUTINE FTDIR(PREEL,KFIELD) !$ACC DATA PRESENT(PREEL,PREEL2) IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF -CALL GSTATS(450,0) +CALL GSTATS(413,0) DO KGL=IBEG,IEND,IINC @@ -99,13 +101,14 @@ SUBROUTINE FTDIR(PREEL,KFIELD) CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA END DO - IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='FTDIR BARRIER') + CALL GSTATS(433,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(433,1) ENDIF -CALL GSTATS(450,1) +CALL GSTATS(413,1) !$ACC END DATA diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index daaf5ac85..7c9edc411 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -70,6 +70,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& USE FTINV_MOD ,ONLY : FTINV USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS +USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_GEN ,ONLY : LSYNC_TRANS use ieee_arithmetic ! @@ -256,11 +258,23 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) +IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) +ENDIF +CALL GSTATS(422,0) !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA +IF (LSYNC_TRANS) THEN + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) +ENDIF +CALL GSTATS(422,1) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 85129a1a3..72bf2719c 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -82,9 +82,11 @@ SUBROUTINE FTINV(PREEL,KFIELD) !$ACC DATA PRESENT(PREEL,PREEL2) IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF -CALL GSTATS(451,0) +CALL GSTATS(423,0) DO KGL=IBEG,IEND,IINC @@ -96,13 +98,14 @@ SUBROUTINE FTINV(PREEL,KFIELD) CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) !$ACC END HOST_DATA END DO - IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='FTINV BARRIER') + CALL GSTATS(443,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(443,1) ENDIF -CALL GSTATS(451,1) +CALL GSTATS(423,1) !$ACC END DATA diff --git a/src/trans/gpu/internal/gstats_label_ifs.F90 b/src/trans/gpu/internal/gstats_label_ifs.F90 index 253cc81bf..0bab13aaa 100644 --- a/src/trans/gpu/internal/gstats_label_ifs.F90 +++ b/src/trans/gpu/internal/gstats_label_ifs.F90 @@ -218,22 +218,27 @@ SUBROUTINE GSTATS_LABEL_IFS CALL GSTATS_LABEL(400,' ','GSTATS ') CALL GSTATS_LABEL(401,' ','GSTATS HOOK') -CALL GSTATS_LABEL(410,' ','MPI - TRMTOL') -CALL GSTATS_LABEL(411,' ','MPI - TRLTOM') -CALL GSTATS_LABEL(412,' ','MPI - TRLTOG') -CALL GSTATS_LABEL(413,' ','MPI - TRGTOL') -CALL GSTATS_LABEL(420,' ','TRLTOM Barrier') -CALL GSTATS_LABEL(421,' ','TRMTOL Barrier') -CALL GSTATS_LABEL(422,' ','TRLTOG Barrier') -CALL GSTATS_LABEL(423,' ','TRGTOL Barrier') -CALL GSTATS_LABEL(430,' ','DIR COPIES') -CALL GSTATS_LABEL(431,' ','INV COPIES') -CALL GSTATS_LABEL(440,' ','FULL DIRTRANS') -CALL GSTATS_LABEL(441,' ','FULL INVTRANS') -CALL GSTATS_LABEL(450,' ','FFTDIR - PLANS') -CALL GSTATS_LABEL(451,' ','FFTINV - PLANS') -CALL GSTATS_LABEL(452,' ','LEDIR') -CALL GSTATS_LABEL(453,' ','LEINV') +CALL GSTATS_LABEL(410,' ','DIR COMPLETE') +CALL GSTATS_LABEL(411,' ','DIR MPI') +CALL GSTATS_LABEL(412,' ','DIR COPIES') +CALL GSTATS_LABEL(413,' ','DIR FFT') +CALL GSTATS_LABEL(414,' ','DIR GEMMS') +CALL GSTATS_LABEL(430,' ','DIR COMPLETE - LB') +CALL GSTATS_LABEL(431,' ','DIR MPI - LB') +CALL GSTATS_LABEL(432,' ','DIR COPIES - LB') +CALL GSTATS_LABEL(433,' ','DIR FFT - LB') +CALL GSTATS_LABEL(434,' ','DIR GEMMS - LB') + +CALL GSTATS_LABEL(420,' ','INV COMPLETE') +CALL GSTATS_LABEL(421,' ','INV MPI') +CALL GSTATS_LABEL(422,' ','INV COPIES') +CALL GSTATS_LABEL(423,' ','INV FFT') +CALL GSTATS_LABEL(424,' ','INV GEMMS') +CALL GSTATS_LABEL(440,' ','INV COMPLETE - LB') +CALL GSTATS_LABEL(441,' ','INV MPI - LB') +CALL GSTATS_LABEL(442,' ','INV COPIES - LB') +CALL GSTATS_LABEL(443,' ','INV FFT - LB') +CALL GSTATS_LABEL(444,' ','INV GEMMS - LB') ! counters 500 to 2000 CALL GSTATS_LABEL(501,'MPL','SLCOMM2_COMMS PART1') diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 858dd0167..f76cbb5bd 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -89,11 +89,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') -ENDIF -CALL GSTATS(452,0) - ALLOCATE(ZINPA(2*KF_FS*R_NDGNH*D_NUMP)) ALLOCATE(ZINPS(2*KF_FS*R_NDGNH*D_NUMP)) ALLOCATE(ZOUT(2*KF_FS*TDZAS*D_NUMP)) @@ -143,6 +138,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! anti-symmetric +IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) +ENDIF +CALL GSTATS(414,0) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -157,6 +158,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZOUT, 2*KF_FS, TDZAA, & & D_NUMP) !$ACC END HOST_DATA +IF (LSYNC_TRANS) THEN + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) +ENDIF +CALL GSTATS(414,1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -212,6 +219,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! symmetric +IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) +ENDIF +CALL GSTATS(414,0) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -226,6 +239,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZOUT, 2*KF_FS, TDZAS, & & D_NUMP) !$ACC END HOST_DATA +IF (LSYNC_TRANS) THEN + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) +ENDIF +CALL GSTATS(414,1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP @@ -284,10 +303,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) DEALLOCATE(ZOUT0) -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='LEDIR BARRIER') -ENDIF -CALL GSTATS(452,1) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 3f206a994..d6505eea5 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -93,10 +93,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! -------------------------- !* 1.1 PREPARATIONS. -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') -ENDIF -CALL GSTATS(453,0) ALLOCATE(ZINP(KFIELDS*TDZAS*D_NUMP)) ALLOCATE(ZOUTS(KFIELDS*R_NDGNH*D_NUMP)) @@ -149,7 +145,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !C=A*B => ! C^T=B^T*A^T - +IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) +ENDIF +CALL GSTATS(424,0) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) CALL CUDA_GEMM_BATCHED( & @@ -162,6 +163,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZOUTA, KFIELDS, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA +IF (LSYNC_TRANS) THEN + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) +ENDIF +CALL GSTATS(424,1) ! 2. +++++++++++++ symmetric !IF KM=0 and NSMAX is 6: @@ -246,11 +253,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DEALLOCATE(ZOUTS) DEALLOCATE(ZOUTA) -IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='LEINV BARRIER') -ENDIF -CALL GSTATS(453,1) - IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index c80ff373b..f0afbd31c 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -183,18 +183,22 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& DEALLOCATE(POA1) IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF - CALL GSTATS(430,0) + CALL GSTATS(412,0) !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA !$ACC END DATA IF (LSYNC_TRANS) THEN + CALL GSTATS(432,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) ENDIF - CALL GSTATS(430,1) + CALL GSTATS(412,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 0fca5d2dc..bd9e412c1 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -121,17 +121,22 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& ! ---------------------------------------------- IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF - CALL GSTATS(431,0) + CALL GSTATS(422,0) !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) IF (LSYNC_TRANS) THEN + CALL GSTATS(442,0) CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) ENDIF + CALL GSTATS(422,1) ! Compute PIA Domain decomposition IFIRST = 0 @@ -175,7 +180,6 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives ENDIF - CALL GSTATS(431,1) IF (KF_UV > 0) THEN CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2),KFLDPTRUV) CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2),KFLDPTRUV) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index f538933af..8eb8432c3 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -401,21 +401,20 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF !$ACC END DATA ENDDO - + !$ACC WAIT(1) CALL GSTATS(1602,1) - !$ACC WAIT(1) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',0,ZHOOK_HANDLE_BAR) CALL GSTATS_BARRIER(761) IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) IF (LSYNC_TRANS) THEN - CALL GSTATS(423,0) - CALL MPL_BARRIER(CDSTRING='TRGTOL BARRIER') - CALL GSTATS(423,1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF - CALL GSTATS(413,0) + CALL GSTATS(411,0) IR=0 @@ -442,9 +441,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& & CDSTRING='TRGTOL_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') ENDIF IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='TRGTOL BARRIER') + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) ENDIF - CALL GSTATS(413,1) + CALL GSTATS(411,1) CALL GSTATS_BARRIER2(761) ! Unpack loop......................................................... diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 735d24234..bb20480a5 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -463,19 +463,19 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, CALL GSTATS(1605,1) - IR=0 IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',0,ZHOOK_HANDLE_BAR) CALL GSTATS_BARRIER(762) IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',1,ZHOOK_HANDLE_BAR) CALL GSTATS(805,0) IF (LSYNC_TRANS) THEN - CALL GSTATS(422,0) - CALL MPL_BARRIER(CDSTRING='TRLTOG BARRIER') - CALL GSTATS(422,1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF - CALL GSTATS(412,0) + CALL GSTATS(421,0) + IR=0 !...Receive loop......................................................... !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) DO INR=1,IRECV_COUNTS @@ -503,9 +503,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDIF IF (LSYNC_TRANS) THEN - CALL MPL_BARRIER(CDSTRING='TRLTOG BARRIER') + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) ENDIF - CALL GSTATS(412,1) + CALL GSTATS(421,1) CALL GSTATS(805,1) CALL GSTATS_BARRIER2(762) @@ -570,7 +572,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, !$ACC END DATA ENDDO - CALL GSTATS(431,0) !$ACC END DATA !$ACC END DATA !$ACC END DATA @@ -581,7 +582,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, !$ACC END DATA !! CREATE ZCOMBUFR !$ACC END DATA !! CREATE ZCOMBUFS !$ACC WAIT(1) - CALL GSTATS(431,1) CALL GSTATS(1606,1) IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index cb27871a7..c6c967d7e 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -120,25 +120,25 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) ILENR(IRANK) = 0 ENDIF - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - IF (LSYNC_TRANS) THEN - CALL GSTATS(420,0) - CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') - CALL GSTATS(420,1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF CALL GSTATS(411,0) + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & & MPL_ALL_MS_COMM,IERROR) - !$ACC END HOST_DATA - !$ACC WAIT(1) - IF (LSYNC_TRANS) THEN - CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) ENDIF CALL GSTATS(411,1) + + !$ACC WAIT(1) CALL GSTATS(806,1) ELSE ILEN = D%NLTSGTB(MYSETW)*2*KF_FS diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 3ae3f8902..f7c045c69 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -120,25 +120,25 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ILENR(IRANK) = 0 ENDIF - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - IF (LSYNC_TRANS) THEN - CALL GSTATS(421,0) - CALL MPL_BARRIER(CDSTRING='TRLTOM BARRIER') - CALL GSTATS(421,1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF - CALL GSTATS(410,0) + CALL GSTATS(421,0) + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& & MPL_ALL_MS_COMM,IERROR) - !$ACC END HOST_DATA - !$ACC WAIT(1) - IF (LSYNC_TRANS) THEN - CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) ENDIF - CALL GSTATS(410,1) + CALL GSTATS(421,1) + + !$ACC WAIT(1) CALL GSTATS(807,1) ELSE ILEN = D%NLTSGTB(MYSETW)*KFIELD From b28ad84eb75a542c50024e1ccaa76b266a31544c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:13 -0700 Subject: [PATCH 121/263] Remove barrier that are not ours --- src/trans/gpu/internal/trgtol_mod.F90 | 10 ---------- src/trans/gpu/internal/trltog_mod.F90 | 8 -------- 2 files changed, 18 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 8eb8432c3..cda804443 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -405,10 +405,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& CALL GSTATS(1602,1) - IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',0,ZHOOK_HANDLE_BAR) - CALL GSTATS_BARRIER(761) - IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) - IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') @@ -447,7 +443,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF CALL GSTATS(411,1) - CALL GSTATS_BARRIER2(761) ! Unpack loop......................................................... CALL GSTATS(1603,0) @@ -1059,10 +1054,6 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1602,1) - IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',0,ZHOOK_HANDLE_BAR) - CALL GSTATS_BARRIER(761) - IF (LHOOK) CALL DR_HOOK('TRGTOL_BAR',1,ZHOOK_HANDLE_BAR) - IF(.NOT.LGPNORM)THEN CALL GSTATS(803,0) ELSE @@ -1110,7 +1101,6 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ELSE CALL GSTATS(804,1) ENDIF - CALL GSTATS_BARRIER2(761) !#ifdef COMVERBOSE ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index bb20480a5..7d4d44b1c 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -463,9 +463,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, CALL GSTATS(1605,1) - IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',0,ZHOOK_HANDLE_BAR) - CALL GSTATS_BARRIER(762) - IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',1,ZHOOK_HANDLE_BAR) CALL GSTATS(805,0) IF (LSYNC_TRANS) THEN @@ -510,7 +507,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, CALL GSTATS(421,1) CALL GSTATS(805,1) - CALL GSTATS_BARRIER2(762) ! Unpack loop......................................................... @@ -1121,9 +1117,6 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1605,1) IR=0 - IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',0,ZHOOK_HANDLE_BAR) - CALL GSTATS_BARRIER(762) - IF (LHOOK) CALL DR_HOOK('TRLTOG_BAR',1,ZHOOK_HANDLE_BAR) CALL GSTATS(805,0) #ifdef COMVERBOSE @@ -1161,7 +1154,6 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& #endif CALL GSTATS(805,1) - CALL GSTATS_BARRIER2(762) #ifdef COMVERBOSE call MPI_BARRIER(MPI_COMM_WORLD,IERROR) From 6000cce99aa8a0cacdac555d0105ce5d59e21828 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:13 -0700 Subject: [PATCH 122/263] Redirect some GSTATS function to add nvtx --- src/trans/gpu/external/dir_trans.F90 | 1 + src/trans/gpu/external/inv_trans.F90 | 1 + src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 1 - src/trans/gpu/internal/ftdir_ctl_mod.F90 | 1 + src/trans/gpu/internal/ftdir_mod.F90 | 1 + src/trans/gpu/internal/ftinv_ctl_mod.F90 | 1 + src/trans/gpu/internal/ftinv_mod.F90 | 1 + src/trans/gpu/internal/gstats_label_ifs.F90 | 1 + src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 1 - src/trans/gpu/internal/ledir_mod.F90 | 1 + src/trans/gpu/internal/leinv_mod.F90 | 1 + src/trans/gpu/internal/ltdir_ctl_mod.F90 | 1 + src/trans/gpu/internal/ltdir_mod.F90 | 2 + src/trans/gpu/internal/ltinv_ctl_mod.F90 | 1 + src/trans/gpu/internal/ltinv_mod.F90 | 1 + src/trans/gpu/internal/tpm_stats.F90 | 52 ++++++++++++++++++++ src/trans/gpu/internal/trgtol_mod.F90 | 1 + src/trans/gpu/internal/trltog_mod.F90 | 1 + src/trans/gpu/internal/trltom_mod.F90 | 1 + src/trans/gpu/internal/trmtol_mod.F90 | 1 + 20 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 src/trans/gpu/internal/tpm_stats.F90 diff --git a/src/trans/gpu/external/dir_trans.F90 b/src/trans/gpu/external/dir_trans.F90 index 63ba412f3..958e9cdf1 100755 --- a/src/trans/gpu/external/dir_trans.F90 +++ b/src/trans/gpu/external/dir_trans.F90 @@ -124,6 +124,7 @@ SUBROUTINE DIR_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX !endif INTERFACE diff --git a/src/trans/gpu/external/inv_trans.F90 b/src/trans/gpu/external/inv_trans.F90 index 3970b14a5..8c46bc391 100755 --- a/src/trans/gpu/external/inv_trans.F90 +++ b/src/trans/gpu/external/inv_trans.F90 @@ -144,6 +144,7 @@ SUBROUTINE INV_TRANS(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX #ifdef _OPENACC use openacc diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 795e5cf40..81b2492eb 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -85,7 +85,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS -use nvtx ! IMPLICIT NONE diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index bc579ba81..cfe04fc3d 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -67,6 +67,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & USE FTDIR_MOD ,ONLY : FTDIR USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX use ieee_arithmetic ! diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 24dae2042..db7c1535f 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -52,6 +52,7 @@ SUBROUTINE FTDIR(PREEL,KFIELD) USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 7c9edc411..d101cc766 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -72,6 +72,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX use ieee_arithmetic ! diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 72bf2719c..e35401929 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -52,6 +52,7 @@ SUBROUTINE FTINV(PREEL,KFIELD) USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/gstats_label_ifs.F90 b/src/trans/gpu/internal/gstats_label_ifs.F90 index 0bab13aaa..72bdf4940 100644 --- a/src/trans/gpu/internal/gstats_label_ifs.F90 +++ b/src/trans/gpu/internal/gstats_label_ifs.F90 @@ -52,6 +52,7 @@ SUBROUTINE GSTATS_LABEL_IFS USE PARKIND1 ,ONLY : JPRB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE YOMGSTATS +USE TPM_STATS, ONLY: GSTATS_LABEL => GSTATS_LABEL_NVTX IMPLICIT NONE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 26dd26573..1c7dd6dbf 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -93,7 +93,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT USE LTINV_CTL_MOD ,ONLY : LTINV_CTL USE FTINV_CTL_MOD ,ONLY : FTINV_CTL -use nvtx ! IMPLICIT NONE diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index f76cbb5bd..24d83d77b 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -63,6 +63,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE MPL_MODULE ,ONLY : MPL_BARRIER +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index d6505eea5..3d1b8d77a 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -62,6 +62,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index 3bb763b46..eaff425c8 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -52,6 +52,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE USE TPM_FIELDS ,ONLY : ZEPSNM + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index f0afbd31c..e4d6b21aa 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -18,6 +18,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE PARKIND1 ,ONLY : JPIM ,JPRB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE TPM_DIM ,ONLY : R USE TPM_DISTR ,ONLY : D @@ -31,6 +32,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX !**** *LTDIR* - Control of Direct Legendre transform step diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index aca1742a8..ec643fb1a 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -59,6 +59,7 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& USE LTINV_MOD ,ONLY : LTINV USE TRMTOL_MOD ,ONLY : TRMTOL, TRMTOL_CUDAAWARE + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index bd9e412c1..ec800cb98 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -32,6 +32,7 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& USE TPM_FIELDS ,ONLY : F,ZEPSNM USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX !**** *LTINV* - Inverse Legendre transform diff --git a/src/trans/gpu/internal/tpm_stats.F90 b/src/trans/gpu/internal/tpm_stats.F90 new file mode 100644 index 000000000..ff382a894 --- /dev/null +++ b/src/trans/gpu/internal/tpm_stats.F90 @@ -0,0 +1,52 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_STATS + +IMPLICIT NONE + +CHARACTER(LEN=32) :: DESCRIPTIONS(100) + +CONTAINS + +SUBROUTINE GSTATS_LABEL_NVTX(KNUM,CTYPE,CDESC) +USE EC_PARKIND ,ONLY : JPIM +IMPLICIT NONE +INTEGER(KIND=JPIM) :: KNUM +CHARACTER(*) CDESC +CHARACTER(*) CTYPE + +IF (KNUM >= 400 .AND. KNUM < 500) THEN + DESCRIPTIONS(KNUM-400+1) = CDESC +ENDIF +CALL GSTATS_LABEL(KNUM,CTYPE,CDESC) +END SUBROUTINE + +SUBROUTINE GSTATS_NVTX(KNUM,KSWITCH) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + USE NVTX + + IMPLICIT NONE + + INTEGER(KIND=JPIM),INTENT(IN) :: KNUM + INTEGER(KIND=JPIM),INTENT(IN) :: KSWITCH + + IF (KNUM >= 400 .AND. KNUM < 500) THEN + IF (KSWITCH == 0) THEN + CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1)) + ELSEIF (KSWITCH == 1) THEN + CALL NVTXENDRANGE() + ENDIF + ENDIF + CALL GSTATS(KNUM,KSWITCH) +END SUBROUTINE GSTATS_NVTX + +END MODULE TPM_STATS + diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index cda804443..4f250a444 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -81,6 +81,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML USE OML_MOD ,ONLY : OML_MY_THREAD USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : NPROMA diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 7d4d44b1c..f15fceb68 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -83,6 +83,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, USE OML_MOD ,ONLY : OML_MY_THREAD USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index c6c967d7e..4e331d8a1 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -69,6 +69,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS USE MPI +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index f7c045c69..593994059 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -69,6 +69,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS USE MPI +USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE From d692ed5f63a520e2c734b463ee44d2ac7726eed8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:14 -0700 Subject: [PATCH 123/263] Add missing GEMM label --- src/trans/gpu/internal/leinv_mod.F90 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 3d1b8d77a..57703d59a 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -203,6 +203,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !C=A*B => ! C^T=B^T*A^T +IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) +ENDIF +CALL GSTATS(424,0) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & @@ -214,6 +220,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZOUTS, KFIELDS, R_NDGNH, & & D_NUMP) !$ACC END HOST_DATA +IF (LSYNC_TRANS) THEN + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) +ENDIF +CALL GSTATS(424,1) ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) !$ACC ENTER DATA CREATE(FOUBUF_IN) From 67263d74bfffe8ce964b44b4da64d8bdae75df15 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:14 -0700 Subject: [PATCH 124/263] Incase parallelism again for some slow kernels in DIR --- src/trans/gpu/internal/updspb_mod.F90 | 30 +++++++++++++-------------- src/trans/gpu/internal/uvtvd_mod.F90 | 30 +++++++++++++-------------- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 25dc71fbf..877895c45 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -96,27 +96,25 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !loop over wavenumber !$ACC DATA PRESENT(PSPEC,POA,R,D) - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM,IR,II) DEFAULT(NONE) DO KMLOC=1,D%NUMP - DO JFLD=1,KFIELD - KM = D%MYMS(KMLOC) - IASM0 = D%NASM0(KM) + DO JN=3,R%NTMAX+3 + DO JFLD=1,KFIELD + KM = D%MYMS(KMLOC) + IASM0 = D%NASM0(KM) - IF(KM == 0) THEN - !$ACC LOOP SEQ - DO JN=3,R%NTMAX+3 - INM = IASM0+(R%NTMAX+3-JN)*2 - PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) - PSPEC(JFLD,INM+1) = 0.0_JPRBT - ENDDO - ELSE - !$ACC LOOP SEQ - DO JN=3,R%NTMAX+3-KM + IF(KM /= 0 .AND. JN <= R%NTMAX+3-KM) THEN + !(DO JN=3,R%NTMAX+3-KM) INM = IASM0+((R%NTMAX+3-JN)-KM)*2 PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) PSPEC(JFLD,INM+1) = POA(2*JFLD ,JN,KMLOC) - ENDDO - END IF + ELSEIF (KM == 0) THEN + !(DO JN=3,R%NTMAX+3) + INM = IASM0+(R%NTMAX+3-JN)*2 + PSPEC(JFLD,INM) = POA(2*JFLD-1,JN,KMLOC) + PSPEC(JFLD,INM+1) = 0.0_JPRBT + END IF + ENDDO ENDDO ENDDO !$ACC END PARALLEL diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 2cffd6ba0..3833fef28 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -103,17 +103,17 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !* 1.2 COMPUTE VORTICITY AND DIVERGENCE. -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,IN,KM,ZKM,JN,ZJN) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO J=1,KF_UV - IR = 2*J-1 - II = IR+1 - KM = D_MYMS(KMLOC) - ZKM = REAL(KM,JPRBT) - - IF(KM /= 0) THEN - !$ACC LOOP SEQ - DO JN=KM,R_NTMAX + DO JN=0,R_NTMAX + DO J=1,KF_UV + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX) IN = R_NTMAX+3-JN ZJN = JN @@ -129,10 +129,8 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) PDIV(II,IN,kmloc) = +ZKM*PU(IR,IN,kmloc)+& &ZJN*ZEPSNM(KMLOC,JN+1)*PV(II,IN-1,kmloc)-& &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(II,IN+1,kmloc) - ENDDO - ELSE - !$ACC LOOP SEQ - DO JN=0,R_NTMAX + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX) IN = R_NTMAX+3-JN ZJN = JN @@ -142,8 +140,8 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) PDIV(IR,IN,kmloc) = & &ZJN*ZEPSNM(KMLOC,JN+1)*PV(IR,IN-1,kmloc)-& &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(IR,IN+1,kmloc) - ENDDO - ENDIF + ENDIF + ENDDO ENDDO ENDDO !$acc end data From 34580665db4c4c6a281edbfaa59c5b3e030d816f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:14 -0700 Subject: [PATCH 125/263] Pimp a bit the NVTX coloring --- src/trans/gpu/internal/tpm_stats.F90 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/tpm_stats.F90 b/src/trans/gpu/internal/tpm_stats.F90 index ff382a894..27c6dcfa8 100644 --- a/src/trans/gpu/internal/tpm_stats.F90 +++ b/src/trans/gpu/internal/tpm_stats.F90 @@ -37,10 +37,19 @@ SUBROUTINE GSTATS_NVTX(KNUM,KSWITCH) INTEGER(KIND=JPIM),INTENT(IN) :: KNUM INTEGER(KIND=JPIM),INTENT(IN) :: KSWITCH + INTEGER(KIND=JPIM) :: ICOLOR IF (KNUM >= 400 .AND. KNUM < 500) THEN IF (KSWITCH == 0) THEN - CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1)) + ICOLOR=0 + IF (KNUM>=430) ICOLOR=10 !LB markers + IF (KNUM==410) ICOLOR=13 !DIR COMPLETE + IF (KNUM==420) ICOLOR=14 !INV COMPLETE + IF (ICOLOR /= 0) THEN + CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1),ICOLOR) + ELSE + CALL NVTXSTARTRANGE(DESCRIPTIONS(KNUM-400+1)) + ENDIF ELSEIF (KSWITCH == 1) THEN CALL NVTXENDRANGE() ENDIF From 699cf3ef07be60ce39a440716e5a1926e4bda7cd Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:14 -0700 Subject: [PATCH 126/263] Try improve LEDIR GEMM array packing --- src/trans/gpu/internal/ledir_mod.F90 | 36 +++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 24d83d77b..8ef2a2285 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -110,25 +110,27 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC END KERNELS !$ACC DATA PRESENT(FOUBUF) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS - PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - IF (JF .LE. 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) + DO JGL=1,R_NDGNH + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + ZINPA(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIS*F%RW(JGL) ENDIF - ZINPA(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIS*F%RW(JGL) ENDDO ENDDO END DO From 7574013db4d5380512db00153e6f6a42870b88a7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:14 -0700 Subject: [PATCH 127/263] Remove scalar copyins --- src/trans/gpu/external/setup_trans.F90 | 2 +- src/trans/gpu/internal/leinv_mod.F90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 4b20bacb2..0e17ae0b2 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -633,7 +633,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& END DO !$ACC ENTER DATA COPYIN(D_NSTAGT0B,D_NSTAGT1B,& -!$ACC& D_NPNTGTB1,D_NPROCL,D_NUMP,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& +!$ACC& D_NPNTGTB1,D_NPROCL,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& !$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,& !$ACC& F_RW) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 57703d59a..0edb7e15d 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -99,7 +99,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOCATE(ZOUTS(KFIELDS*R_NDGNH*D_NUMP)) ALLOCATE(ZOUTA(KFIELDS*R_NDGNH*D_NUMP)) -!$ACC DATA COPYIN(D,D_MYMS,G,G_NDGLU,D_NUMP,R,R_NDGNH,R_NSMAX) & +!$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & !$ACC& CREATE (ZINP,ZOUTS,ZOUTA) & !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) From 8f4a7d37845bd312eda960b5139cba8043822dd7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:15 -0700 Subject: [PATCH 128/263] clang-format --- .../external/fourier/create_plan_fftc.cu | 246 +++++++++--------- .../external/fourier/destroy_plan_fftc.cu | 78 +++--- .../external/fourier/execute_plan_fftc.cu | 118 ++++----- .../algor/external/fourier/storage_fftc.cu | 42 ++- 4 files changed, 228 insertions(+), 256 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index b59fc2e00..97efeb782 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -1,166 +1,154 @@ #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) #include "cufft.h" #include "stdio.h" - static const char *_cudaGetErrorEnum(cufftResult error) - { - switch (error) - { - case CUFFT_SUCCESS: +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: + case CUFFT_INVALID_PLAN: return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: + case CUFFT_ALLOC_FAILED: return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: + case CUFFT_INVALID_TYPE: return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: + case CUFFT_INVALID_VALUE: return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: + case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: + case CUFFT_EXEC_FAILED: return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: + case CUFFT_SETUP_FAILED: return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: + case CUFFT_INVALID_SIZE: return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: + case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; - } + } - return ""; - } + return ""; +} - inline void __cufftSafeCall(cufftResult err, const char *file, const int line) - { - if( CUFFT_SUCCESS != err) { +inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { + if (CUFFT_SUCCESS != err) { fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); fprintf(stderr, "CUFFT error at 2\n"); /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: + %s\nterminating!\n",__FILE__, __LINE__,err, \ _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ - cudaDeviceReset(); return; \ - } - } - - -static int allocatedWorkspace=0; -static void* planWorkspace; -static int planWorkspaceSize=100*1024*1024; //100MB - -extern "C" -void -create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp, int *stridep) -{ -int ISIGN = *ISIGNp; -int N = *Np; -int LOT = *LOTp; -int stride = *stridep; - -cufftHandle plan; - -if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, + _cudaGetErrorEnum(err)); + cudaDeviceReset(); + return; + } } - -// //create a single re-usable workspace -// if(!allocatedWorkspace){ -// allocatedWorkspace=1; -// //allocate plan workspace -// cudaMalloc(&planWorkspace,planWorkspaceSize); -// } -// -// //disable auto allocation so we can re-use a single workspace (created above) -// cufftSetAutoAllocation(plan, false); - -int embed[1]; -int dist; +static int allocatedWorkspace = 0; +static void *planWorkspace; +static int planWorkspaceSize = 100 * 1024 * 1024; // 100MB + +extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, + int *LOTp, int *stridep) { + int ISIGN = *ISIGNp; + int N = *Np; + int LOT = *LOTp; + int stride = *stridep; + + cufftHandle plan; + + if (cudaDeviceSynchronize() != cudaSuccess) { + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; + } + + // //create a single re-usable workspace + // if(!allocatedWorkspace){ + // allocatedWorkspace=1; + // //allocate plan workspace + // cudaMalloc(&planWorkspace,planWorkspaceSize); + // } + // + // //disable auto allocation so we can re-use a single workspace (created + // above) + // cufftSetAutoAllocation(plan, false); + + int embed[1]; + int dist; #ifdef TRANS_SINGLE -cufftType cufft_1 = CUFFT_R2C; -cufftType cufft_2 = CUFFT_C2R; + cufftType cufft_1 = CUFFT_R2C; + cufftType cufft_2 = CUFFT_C2R; #else -cufftType cufft_1 = CUFFT_D2Z; -cufftType cufft_2 = CUFFT_Z2D; + cufftType cufft_1 = CUFFT_D2Z; + cufftType cufft_2 = CUFFT_Z2D; #endif -embed[0] = 1; -dist = 1; - -cufftSafeCall(cufftCreate(&plan)); - -//printf("CreatePlan cuFFT\n","N=",N); -//printf("%s %d \n","plan=",plan); -//printf("%s %d \n","LOT=",LOT); -//printf("%s %d \n","ISIGN=",ISIGN); -//printf("%s %d \n","Np=",*Np); - -if( ISIGN== -1 ){ - cufftSafeCall(cufftPlanMany(&plan, 1, &N, - embed, stride, dist, - embed, stride, dist, - cufft_1, LOT)); - //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); -} -else if( ISIGN== 1){ - cufftSafeCall(cufftPlanMany(&plan, 1, &N, - embed, stride, dist, - embed, stride, dist, - cufft_2, LOT)); - //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); -} -else { - abort(); -} - -// // use our reusaable work area for the plan -// cufftSetWorkArea(plan,planWorkspace); - -/* -if( ISIGN== -1 ){ - cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); -} -else if( ISIGN== 1){ - cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); + embed[0] = 1; + dist = 1; + + cufftSafeCall(cufftCreate(&plan)); + + // printf("CreatePlan cuFFT\n","N=",N); + // printf("%s %d \n","plan=",plan); + // printf("%s %d \n","LOT=",LOT); + // printf("%s %d \n","ISIGN=",ISIGN); + // printf("%s %d \n","Np=",*Np); + + if (ISIGN == -1) { + cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, + stride, dist, cufft_1, LOT)); + // cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); + } else if (ISIGN == 1) { + cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, + stride, dist, cufft_2, LOT)); + // cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); + } else { + abort(); + } + + // // use our reusaable work area for the plan + // cufftSetWorkArea(plan,planWorkspace); + + /* + if( ISIGN== -1 ){ + cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); + } + else if( ISIGN== 1){ + cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); + } + else { + abort(); + } + */ + + if (cudaDeviceSynchronize() != cudaSuccess) { + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; + } + + *PLANp = plan; + + // // get size used by this plan + // size_t workSize; + // cufftGetSize(plan,&workSize); + // + // // exit if we don't have enough space for the work area in the re-usable + // workspace if(workSize > planWorkspaceSize){ + // printf("create_plan_fftc: plan workspace size not large enough - + // exiting\n"); + // exit(1); + // } + + return; } -else { - abort(); -} -*/ - -if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; -} - -*PLANp=plan; - -// // get size used by this plan -// size_t workSize; -// cufftGetSize(plan,&workSize); -// -// // exit if we don't have enough space for the work area in the re-usable workspace -// if(workSize > planWorkspaceSize){ -// printf("create_plan_fftc: plan workspace size not large enough - exiting\n"); -// exit(1); -// } - - -return; - - -} - diff --git a/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu index d0dd94201..df8478345 100644 --- a/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu @@ -1,77 +1,71 @@ #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) #include "cufft.h" #include "stdio.h" - static const char *_cudaGetErrorEnum(cufftResult error) - { - switch (error) - { - case CUFFT_SUCCESS: +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: + case CUFFT_INVALID_PLAN: return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: + case CUFFT_ALLOC_FAILED: return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: + case CUFFT_INVALID_TYPE: return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: + case CUFFT_INVALID_VALUE: return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: + case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: + case CUFFT_EXEC_FAILED: return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: + case CUFFT_SETUP_FAILED: return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: + case CUFFT_INVALID_SIZE: return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: + case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; - } + } - return ""; - } + return ""; +} - inline void __cufftSafeCall(cufftResult err, const char *file, const int line) - { - if( CUFFT_SUCCESS != err) { +inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { + if (CUFFT_SUCCESS != err) { fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); fprintf(stderr, "CUFFT error at 2\n"); /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: + %s\nterminating!\n",__FILE__, __LINE__,err, \ _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ - cudaDeviceReset(); return; \ - } - } - -extern "C" -void -destroy_plan_fftc_(cufftHandle *PLANp) -{ -cufftHandle plan = *PLANp; - -if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, + _cudaGetErrorEnum(err)); + cudaDeviceReset(); + return; + } } -cufftSafeCall(cufftDestroy(plan)); +extern "C" void destroy_plan_fftc_(cufftHandle *PLANp) { + cufftHandle plan = *PLANp; -if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; -} + if (cudaDeviceSynchronize() != cudaSuccess) { + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; + } + cufftSafeCall(cufftDestroy(plan)); + if (cudaDeviceSynchronize() != cudaSuccess) { + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; + } } - diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index 51a069705..7f33705a2 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -1,100 +1,96 @@ #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) #include "cufft.h" #include "stdio.h" - static const char *_cudaGetErrorEnum(cufftResult error) - { - switch (error) - { - case CUFFT_SUCCESS: +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: + case CUFFT_INVALID_PLAN: return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: + case CUFFT_ALLOC_FAILED: return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: + case CUFFT_INVALID_TYPE: return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: + case CUFFT_INVALID_VALUE: return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: + case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: + case CUFFT_EXEC_FAILED: return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: + case CUFFT_SETUP_FAILED: return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: + case CUFFT_INVALID_SIZE: return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: + case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; - } + } - return ""; - } + return ""; +} - inline void __cufftSafeCall(cufftResult err, const char *file, const int line) - { - if( CUFFT_SUCCESS != err) { +inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { + if (CUFFT_SUCCESS != err) { fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); fprintf(stderr, "CUFFT error at 2\n"); /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: + %s\nterminating!\n",__FILE__, __LINE__,err, \ _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ - cudaDeviceReset(); return; \ - } - } + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, + _cudaGetErrorEnum(err)); + cudaDeviceReset(); + return; + } +} -extern "C" -void +extern "C" void #ifdef TRANS_SINGLE -execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftComplex *data_in, cufftComplex *data_out) +execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftComplex *data_in, + cufftComplex *data_out) #else -execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, cufftDoubleComplex *data_out) +execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, + cufftDoubleComplex *data_out) #endif { -cufftHandle plan = *PLANp; -int ISIGN = *ISIGNp; - -/*if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; -}*/ - -if( ISIGN== -1 ){ - #ifdef TRANS_SINGLE - cufftSafeCall(cufftExecR2C(plan, (cufftReal*)data_in, data_out)); - #else - cufftSafeCall(cufftExecD2Z(plan, (cufftDoubleReal*)data_in, data_out)); - #endif -} -else if( ISIGN== 1){ - #ifdef TRANS_SINGLE - cufftSafeCall(cufftExecC2R(plan, data_in, (cufftReal*)data_out)); - #else - cufftSafeCall(cufftExecZ2D(plan, data_in, (cufftDoubleReal*)data_out)); - #endif -} -else { - abort(); -} + cufftHandle plan = *PLANp; + int ISIGN = *ISIGNp; -// cudaDeviceSynchronize(); + /*if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; + }*/ -//if (cudaDeviceSynchronize() != cudaSuccess){ -// fprintf(stderr, "Cuda error: Failed to synchronize\n"); -// return; -//} + if (ISIGN == -1) { +#ifdef TRANS_SINGLE + cufftSafeCall(cufftExecR2C(plan, (cufftReal *)data_in, data_out)); +#else + cufftSafeCall(cufftExecD2Z(plan, (cufftDoubleReal *)data_in, data_out)); +#endif + } else if (ISIGN == 1) { +#ifdef TRANS_SINGLE + cufftSafeCall(cufftExecC2R(plan, data_in, (cufftReal *)data_out)); +#else + cufftSafeCall(cufftExecZ2D(plan, data_in, (cufftDoubleReal *)data_out)); +#endif + } else { + abort(); + } + // cudaDeviceSynchronize(); + // if (cudaDeviceSynchronize() != cudaSuccess){ + // fprintf(stderr, "Cuda error: Failed to synchronize\n"); + // return; + //} } - diff --git a/src/trans/gpu/algor/external/fourier/storage_fftc.cu b/src/trans/gpu/algor/external/fourier/storage_fftc.cu index 7badd5fab..729195338 100644 --- a/src/trans/gpu/algor/external/fourier/storage_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/storage_fftc.cu @@ -1,28 +1,22 @@ #include "cufft.h" #include "stdio.h" -extern "C" -cufftDoubleComplex *create_storage_(int *Np) -{ - int N = *Np; - cufftDoubleComplex *data; - /*cudaMalloc((void**)&data,sizeof(cufftDoubleComplex)*N); - if (cudaGetLastError() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to allocate\n"); - return 0; - } - return data;*/ - printf("%s %d \n","sizeof(cufftDoubleComplex)=",sizeof(cufftDoubleComplex)); - printf("%s %d \n","N=",N); - if (cudaMalloc(&data, sizeof(cufftDoubleComplex)*N) == cudaSuccess){ - printf("%s %X \n","data ",data); - return data; - } - fprintf(stderr, "Cuda error: Failed to allocate\n"); - return 0; +extern "C" cufftDoubleComplex *create_storage_(int *Np) { + int N = *Np; + cufftDoubleComplex *data; + /*cudaMalloc((void**)&data,sizeof(cufftDoubleComplex)*N); + if (cudaGetLastError() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to allocate\n"); + return 0; + } + return data;*/ + printf("%s %d \n", "sizeof(cufftDoubleComplex)=", sizeof(cufftDoubleComplex)); + printf("%s %d \n", "N=", N); + if (cudaMalloc(&data, sizeof(cufftDoubleComplex) * N) == cudaSuccess) { + printf("%s %X \n", "data ", data); + return data; + } + fprintf(stderr, "Cuda error: Failed to allocate\n"); + return 0; } -extern "C" -void destroy_storage_(int *ptr) -{ - cudaFree(ptr); -} +extern "C" void destroy_storage_(int *ptr) { cudaFree(ptr); } From 77b7658ad54b9fd632cf50cf60dda95cb531ab0b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:15 -0700 Subject: [PATCH 129/263] CUFFT: Use workspace --- .../external/fourier/create_plan_fftc.cu | 56 ++++++------------- .../external/fourier/execute_plan_fftc.cu | 14 +---- src/trans/gpu/internal/tpm_fftc.F90 | 8 +-- 3 files changed, 23 insertions(+), 55 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index 97efeb782..e8fa3e661 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -54,12 +54,11 @@ inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { } } -static int allocatedWorkspace = 0; -static void *planWorkspace; -static int planWorkspaceSize = 100 * 1024 * 1024; // 100MB +void *planWorkspace; +static int currentWorkspaceSize = 0; extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, - int *LOTp, int *stridep) { + int *LOTp, int *stridep, int *plan_size) { int ISIGN = *ISIGNp; int N = *Np; int LOT = *LOTp; @@ -72,17 +71,6 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, return; } - // //create a single re-usable workspace - // if(!allocatedWorkspace){ - // allocatedWorkspace=1; - // //allocate plan workspace - // cudaMalloc(&planWorkspace,planWorkspaceSize); - // } - // - // //disable auto allocation so we can re-use a single workspace (created - // above) - // cufftSetAutoAllocation(plan, false); - int embed[1]; int dist; @@ -99,6 +87,9 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, cufftSafeCall(cufftCreate(&plan)); + // Disable auto allocation + cufftSetAutoAllocation(plan, false); + // printf("CreatePlan cuFFT\n","N=",N); // printf("%s %d \n","plan=",plan); // printf("%s %d \n","LOT=",LOT); @@ -108,29 +99,24 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, if (ISIGN == -1) { cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, stride, dist, cufft_1, LOT)); - // cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); } else if (ISIGN == 1) { cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, stride, dist, cufft_2, LOT)); - // cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); } else { abort(); } - // // use our reusaable work area for the plan - // cufftSetWorkArea(plan,planWorkspace); + // get size used by this plan + size_t thisWorkplanSize; + cufftGetSize(plan, &thisWorkplanSize); - /* - if( ISIGN== -1 ){ - cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); - } - else if( ISIGN== 1){ - cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); + // check if this the work space is sufficiently large + if (thisWorkplanSize > currentWorkspaceSize) { + cudaDeviceSynchronize(); + cudaFree(planWorkspace); + cudaMalloc(&planWorkspace, thisWorkplanSize); + currentWorkspaceSize = thisWorkplanSize; } - else { - abort(); - } - */ if (cudaDeviceSynchronize() != cudaSuccess) { fprintf(stderr, "Cuda error: Failed to synchronize\n"); @@ -138,17 +124,7 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, } *PLANp = plan; - - // // get size used by this plan - // size_t workSize; - // cufftGetSize(plan,&workSize); - // - // // exit if we don't have enough space for the work area in the re-usable - // workspace if(workSize > planWorkspaceSize){ - // printf("create_plan_fftc: plan workspace size not large enough - - // exiting\n"); - // exit(1); - // } + *plan_size = thisWorkplanSize; return; } diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index 7f33705a2..ce1ba14f9 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -54,6 +54,8 @@ inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { } } +extern void *planWorkspace; + extern "C" void #ifdef TRANS_SINGLE execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftComplex *data_in, @@ -66,10 +68,7 @@ execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, cufftHandle plan = *PLANp; int ISIGN = *ISIGNp; - /*if (cudaDeviceSynchronize() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; - }*/ + cufftSafeCall(cufftSetWorkArea(plan, planWorkspace)); if (ISIGN == -1) { #ifdef TRANS_SINGLE @@ -86,11 +85,4 @@ execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, } else { abort(); } - - // cudaDeviceSynchronize(); - - // if (cudaDeviceSynchronize() != cudaSuccess){ - // fprintf(stderr, "Cuda error: Failed to synchronize\n"); - // return; - //} } diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index 4586832fe..3f9c6ad00 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -73,7 +73,7 @@ SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT,KSTRIDE) INTEGER(KIND=JPIM),INTENT(OUT) :: KPLAN INTEGER(KIND=JPIM),INTENT(IN) :: KTYPE,KN,KLOT,KSTRIDE -INTEGER(KIND=JPIM) :: IPLAN +INTEGER(KIND=JPIM) :: IPLAN, IPLAN_SIZE INTEGER(KIND=JPIM) :: IRANK, ISTRIDE INTEGER(KIND=JPIM) :: JL, JN INTEGER(KIND=JPIM) :: IRDIST,ICDIST,IN(1),IEMBED(1) @@ -81,10 +81,10 @@ SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT,KSTRIDE) LOGICAL :: LLRESTRICT_PLANS=.TRUE. TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN,START_FFTC_PLAN INTERFACE - SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT,KSTRIDE) BIND(C,NAME="create_plan_fftc_") + SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT,KSTRIDE,PLAN_SIZE) BIND(C,NAME="create_plan_fftc_") USE, INTRINSIC :: ISO_C_BINDING INTEGER(C_INT) :: KPLAN - INTEGER(C_INT) :: KTYPE,KN,KLOT,KSTRIDE + INTEGER(C_INT) :: KTYPE,KN,KLOT,KSTRIDE,PLAN_SIZE END SUBROUTINE CREATE_PLAN_FFTC END INTERFACE @@ -138,7 +138,7 @@ END SUBROUTINE CREATE_PLAN_FFTC ! WRITE(*,'("CREATE_PLAN_FFT: END: DESTROYING A PLAN AT THE START OF THE LIST")') ENDIF ENDIF - CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT,KSTRIDE) + CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT,KSTRIDE,IPLAN_SIZE) KPLAN=IPLAN TC%N_PLANS(KN)=TC%N_PLANS(KN)+1 IF( TC%N_PLANS(KN) /= 1 )THEN From 38d98aaea75bfb03d87fda9cfb48527e4552c60c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:15 -0700 Subject: [PATCH 130/263] The complex part of ZGTF is compact now --- .../algor/external/fourier/create_plan_fftc.cu | 4 ++-- src/trans/gpu/internal/fourier_in_mod.F90 | 4 ++-- src/trans/gpu/internal/fourier_out_mod.F90 | 4 ++-- src/trans/gpu/internal/fsc_mod.F90 | 16 ++++++++-------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index e8fa3e661..6a42e9ed8 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -98,9 +98,9 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, if (ISIGN == -1) { cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, - stride, dist, cufft_1, LOT)); + stride/2, dist, cufft_1, LOT)); } else if (ISIGN == 1) { - cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, + cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride/2, dist, embed, stride, dist, cufft_2, LOT)); } else { abort(); diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index d059cda77..6b3074a92 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -83,8 +83,8 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) IPROC = D_NPROCM(JM) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 - PREEL(2*JF-1,2*JM+IOFF) = FOUBUF(ISTA+2*JF-1) - PREEL(2*JF, 2*JM+IOFF) = FOUBUF(ISTA+2*JF ) + PREEL(2*JF-1,JM+IOFF) = FOUBUF(ISTA+2*JF-1) + PREEL(2*JF, JM+IOFF) = FOUBUF(ISTA+2*JF ) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 6983f32dc..218be5854 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -85,8 +85,8 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 ! This is not contiguous in PREEL due to the memory layout. - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL(2*JF-1, 2*JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL(2*JF , 2*JM+IOFF) + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL(2*JF-1, JM+IOFF) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL(2*JF , JM+IOFF) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 43f090a0d..80ea7cb06 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -104,8 +104,8 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV(2*JF-1,2*JM+IOFF) = PUV(2*JF-1,2*JM+IOFF)*ZACHTE2 - PUV(2*JF, 2*JM+IOFF) = PUV(2*JF ,2*JM+IOFF)*ZACHTE2 + PUV(2*JF-1,JM+IOFF) = PUV(2*JF-1,JM+IOFF)*ZACHTE2 + PUV(2*JF, JM+IOFF) = PUV(2*JF ,JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -123,8 +123,8 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_NSDER(2*JF-1,2*JM+IOFF) = PSCALARS_NSDER(2*JF-1,2*JM+IOFF)*ZACHTE2 - PSCALARS_NSDER(2*JF, 2*JM+IOFF) = PSCALARS_NSDER(2*JF, 2*JM+IOFF)*ZACHTE2 + PSCALARS_NSDER(2*JF-1,JM+IOFF) = PSCALARS_NSDER(2*JF-1,JM+IOFF)*ZACHTE2 + PSCALARS_NSDER(2*JF, JM+IOFF) = PSCALARS_NSDER(2*JF, JM+IOFF)*ZACHTE2 ENDDO ENDDO ENDDO @@ -148,8 +148,8 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV_EWDER(2*JF-1,2*JM+IOFF) = -PUV(2*JF,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - PUV_EWDER(2*JF, 2*JM+IOFF) = PUV(2*JF-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF-1,JM+IOFF) = -PUV(2*JF, JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF, JM+IOFF) = PUV(2*JF-1,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO @@ -167,8 +167,8 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_EWDER(2*JF-1,2*JM+IOFF) = -PSCALARS(2*JF,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - PSCALARS_EWDER(2*JF, 2*JM+IOFF) = PSCALARS(2*JF-1,2*JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF-1,JM+IOFF) = -PSCALARS(2*JF ,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF, JM+IOFF) = PSCALARS(2*JF-1,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO From e076def184ba5ff4e79d70aef4e0d9829e78a27e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:15 -0700 Subject: [PATCH 131/263] CUFFT: Fix memory layout and reduce memory overhead for dirtrans (CHANGE2: 5) --- .../external/fourier/create_plan_fftc.cu | 39 +++++++------------ .../external/fourier/execute_plan_fftc.cu | 34 +++++++--------- src/trans/gpu/internal/fourier_out_mod.F90 | 16 ++++---- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 24 +++++++----- src/trans/gpu/internal/ftdir_mod.F90 | 33 +++++++--------- src/trans/gpu/internal/sump_trans_mod.F90 | 5 +++ 6 files changed, 69 insertions(+), 82 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index 6a42e9ed8..a96d36078 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -1,4 +1,3 @@ -#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) #include "cufft.h" #include "stdio.h" static const char *_cudaGetErrorEnum(cufftResult error) { @@ -36,25 +35,17 @@ static const char *_cudaGetErrorEnum(cufftResult error) { return ""; } - -inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { - if (CUFFT_SUCCESS != err) { - fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); - fprintf(stderr, "CUFFT error at 2\n"); - /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ - fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: - %s\nterminating!\n",__FILE__, __LINE__,err, \ - _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, - _cudaGetErrorEnum(err)); - cudaDeviceReset(); - return; - } +#define CUFFT_CHECK(e) { \ + cufftResult_t err = (e); \ + if (err != CUFFT_SUCCESS) \ + { \ + fprintf(stderr, "CUFFT error: %s, line %d, %s: %s\n", \ + __FILE__, __LINE__, #e, _cudaGetErrorEnum(err)); \ + exit(EXIT_FAILURE); \ + } \ } -void *planWorkspace; +void *planWorkspace = nullptr; static int currentWorkspaceSize = 0; extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, @@ -85,10 +76,10 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, embed[0] = 1; dist = 1; - cufftSafeCall(cufftCreate(&plan)); + CUFFT_CHECK(cufftCreate(&plan)); // Disable auto allocation - cufftSetAutoAllocation(plan, false); + CUFFT_CHECK(cufftSetAutoAllocation(plan, false)); // printf("CreatePlan cuFFT\n","N=",N); // printf("%s %d \n","plan=",plan); @@ -97,10 +88,10 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, // printf("%s %d \n","Np=",*Np); if (ISIGN == -1) { - cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, - stride/2, dist, cufft_1, LOT)); + CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, + stride, dist, cufft_1, LOT)); } else if (ISIGN == 1) { - cufftSafeCall(cufftPlanMany(&plan, 1, &N, embed, stride/2, dist, embed, + CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride/2, dist, embed, stride, dist, cufft_2, LOT)); } else { abort(); @@ -108,7 +99,7 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, // get size used by this plan size_t thisWorkplanSize; - cufftGetSize(plan, &thisWorkplanSize); + CUFFT_CHECK(cufftGetSize(plan, &thisWorkplanSize)); // check if this the work space is sufficiently large if (thisWorkplanSize > currentWorkspaceSize) { diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index ce1ba14f9..3acf1afbf 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -1,4 +1,3 @@ -#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) #include "cufft.h" #include "stdio.h" static const char *_cudaGetErrorEnum(cufftResult error) { @@ -37,21 +36,14 @@ static const char *_cudaGetErrorEnum(cufftResult error) { return ""; } -inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { - if (CUFFT_SUCCESS != err) { - fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); - fprintf(stderr, "CUFFT error at 2\n"); - /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ - fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: - %s\nterminating!\n",__FILE__, __LINE__,err, \ - _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, - _cudaGetErrorEnum(err)); - cudaDeviceReset(); - return; - } +#define CUFFT_CHECK(e) { \ + cufftResult_t err = (e); \ + if (err != CUFFT_SUCCESS) \ + { \ + fprintf(stderr, "CUFFT error: %s, line %d, %s: %s\n", \ + __FILE__, __LINE__, #e, _cudaGetErrorEnum(err)); \ + exit(EXIT_FAILURE); \ + } \ } extern void *planWorkspace; @@ -68,19 +60,19 @@ execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, cufftHandle plan = *PLANp; int ISIGN = *ISIGNp; - cufftSafeCall(cufftSetWorkArea(plan, planWorkspace)); + CUFFT_CHECK(cufftSetWorkArea(plan, planWorkspace)); if (ISIGN == -1) { #ifdef TRANS_SINGLE - cufftSafeCall(cufftExecR2C(plan, (cufftReal *)data_in, data_out)); + CUFFT_CHECK(cufftExecR2C(plan, (cufftReal *)data_in, data_out)); #else - cufftSafeCall(cufftExecD2Z(plan, (cufftDoubleReal *)data_in, data_out)); + CUFFT_CHECK(cufftExecD2Z(plan, (cufftDoubleReal *)data_in, data_out)); #endif } else if (ISIGN == 1) { #ifdef TRANS_SINGLE - cufftSafeCall(cufftExecC2R(plan, data_in, (cufftReal *)data_out)); + CUFFT_CHECK(cufftExecC2R(plan, data_in, (cufftReal *)data_out)); #else - cufftSafeCall(cufftExecZ2D(plan, data_in, (cufftDoubleReal *)data_out)); + CUFFT_CHECK(cufftExecZ2D(plan, data_in, (cufftDoubleReal *)data_out)); #endif } else { abort(); diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 218be5854..f15ee973a 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_OUT_MOD CONTAINS -SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) +SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) !**** *FOURIER_OUT* - Copy fourier data from local array to buffer @@ -45,11 +45,11 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) IMPLICIT NONE -REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) +REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:,:) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_COMPLEX,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: SCAL @@ -67,16 +67,16 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS*2)) !$ACC ENTER DATA CREATE(FOUBUF_IN) -!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL,D_NSTAGTF,G_NLOEN) ASYNC(1) +!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = (D_NSTAGTF(KGL)/2)+1 SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) !$ACC LOOP SEQ @@ -85,8 +85,8 @@ SUBROUTINE FOURIER_OUT(PREEL,FOUBUF_IN,KFIELDS) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 ! This is not contiguous in PREEL due to the memory layout. - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL(2*JF-1, JM+IOFF) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL(2*JF , JM+IOFF) + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(2*JF-1, JM+IOFF_COMPLEX) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(2*JF , JM+IOFF_COMPLEX) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index cfe04fc3d..b1ea52d74 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -92,7 +92,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) ! Local variables -REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF_REAL(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF_COMPLEX(:,:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -149,9 +150,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF -! Note that this buffer is 2X too large, we will need to transpose ZGTF to get rid of this -ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) -!$ACC ENTER DATA CREATE(ZGTF) +ALLOCATE(ZGTF_REAL(KF_FS, D%NLENGTF)) +!$ACC ENTER DATA CREATE(ZGTF_REAL) ! Transposition @@ -175,7 +175,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(432,1) ENDIF CALL GSTATS(412,1) -CALL TRGTOL_CUDAAWARE(ZGTF,KF_FS,KF_GP,IVSET,KPTRGP,& +CALL TRGTOL_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !$ACC END DATA !$ACC END DATA @@ -183,7 +183,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & !$ACC END DATA !$ACC END DATA #else -CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRGTOL(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif @@ -194,13 +194,17 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF,KF_FS) - CALL FOURIER_OUT(ZGTF,FOUBUF_IN,KF_FS) + CALL FTDIR(ZGTF_REAL,ZGTF_COMPLEX,KF_FS) + CALL FOURIER_OUT(ZGTF_COMPLEX,FOUBUF_IN,KF_FS) +ELSE + ZGTF_COMPLEX => ZGTF_REAL ENDIF CALL GSTATS(1640,1) -!$ACC EXIT DATA DELETE(ZGTF) -DEALLOCATE(ZGTF) +IF (ALLOCATED(ZGTF_COMPLEX)) THEN + !$ACC EXIT DATA DELETE(ZGTF_COMPLEX) + DEALLOCATE(ZGTF_COMPLEX) +ENDIF CALL GSTATS(106,1) ! ------------------------------------------------------------------ END SUBROUTINE FTDIR_CTL diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index db7c1535f..4240b8189 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_MOD CONTAINS -SUBROUTINE FTDIR(PREEL,KFIELD) +SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) !**** *FTDIR - Direct Fourier transform @@ -57,12 +57,12 @@ SUBROUTINE FTDIR(PREEL,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:,:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:,:) -INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL,IRET +INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -REAL(KIND=JPRBT), POINTER :: PREEL2(:,:), TMP(:,:) ! ------------------------------------------------------------------ @@ -77,10 +77,10 @@ SUBROUTINE FTDIR(PREEL,KFIELD) ENDIF -ALLOCATE(PREEL2(SIZE(PREEL,1),SIZE(PREEL,2))) -!$ACC ENTER DATA CREATE(PREEL2) +ALLOCATE(PREEL_COMPLEX(2*KFIELD,D%NLENGTF/2)) +!$ACC ENTER DATA CREATE(PREEL_COMPLEX) -!$ACC DATA PRESENT(PREEL,PREEL2) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) @@ -94,12 +94,13 @@ SUBROUTINE FTDIR(PREEL,KFIELD) ! NSTAGTF gives us space for NLOEN+3 elements ! In reality, at this point we need space for at most NLOEN+2 elements ! (in case NLOEN is even, otherwise NLOEN+1, due to the R2C definition) - IOFF=D%NSTAGTF(KGL)+1 + IOFF_REAL=D%NSTAGTF(KGL)+1 + IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD*2) - !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) - CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL(1,IOFF),PREEL2(1,IOFF)) + CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) + !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) + CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL_REAL(1,IOFF_REAL),PREEL_COMPLEX(1,IOFF_COMPLEX)) !$ACC END HOST_DATA END DO IRET = CUDA_SYNCHRONIZE() @@ -113,14 +114,8 @@ SUBROUTINE FTDIR(PREEL,KFIELD) !$ACC END DATA -! Swap pointers -TMP => PREEL -PREEL => PREEL2 -PREEL2 => TMP - -! and deallocate the local pointer -!$ACC EXIT DATA DELETE(PREEL2) -DEALLOCATE(PREEL2) +!$ACC EXIT DATA DELETE(PREEL_REAL) +DEALLOCATE(PREEL_REAL) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 6800b7c71..806b0e4f9 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -263,6 +263,11 @@ SUBROUTINE SUMP_TRANS D%NSTAGTF(JGL) = IOFF IGL = D%NPTRLS(MYSETW) + JGL - 1 IOFF = IOFF + G%NLOEN(IGL)+3 + ! Make sure IOFF is even. This could really lead to slightly too large buffers + ! esp because the (+3) above (needed?), but it is crucial to have those even + ! because with these offsets we can store complex numbers, and CUFFT won't accept + ! unaligned complex buffers + IOFF = (IOFF+1)/2*2 ENDDO D%NLENGTF = IOFF ENDIF From 983769c55d317b778a796889cfb97d8d8af252f2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:15 -0700 Subject: [PATCH 132/263] Fix memory layout and reduce memory overhead for invtrans (CHANGE2: 6) --- .../external/fourier/create_plan_fftc.cu | 2 +- src/trans/gpu/internal/fourier_in_mod.F90 | 20 +++++----- src/trans/gpu/internal/fsc_mod.F90 | 26 ++++++------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 38 ++++++++++--------- src/trans/gpu/internal/ftinv_mod.F90 | 33 +++++++--------- 5 files changed, 58 insertions(+), 61 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu index a96d36078..d47c28cec 100644 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -91,7 +91,7 @@ extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, stride, dist, cufft_1, LOT)); } else if (ISIGN == 1) { - CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride/2, dist, embed, + CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, stride, dist, cufft_2, LOT)); } else { abort(); diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 6b3074a92..136b2da3a 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_IN_MOD CONTAINS -SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) +SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) !**** *FOURIER_IN* - Copy fourier data from buffer to local array @@ -22,7 +22,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) ! ---------- ! CALL FOURIER_IN(...) -! Explicit arguments : PREEL - local fourier/GP array +! Explicit arguments : PREEL_COMPLEX - local fourier/GP array ! -------------------- KFIELDS - number of fields ! ! Externals. None. @@ -46,10 +46,10 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) IMPLICIT NONE REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) +REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_COMPLEX,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC IF(MYPROC > NPROC/2)THEN @@ -62,29 +62,29 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL,KFIELDS) IINC=-1 ENDIF -!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL,D_NSTAGTF) ASYNC(1) +!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) ! TODO: We don't need to zero out the full array here but we need to zero out because implicit ! truncation happens. We cannot rely on previous iterations that they had the same configuration. !$ACC KERNELS ASYNC(1) -PREEL(:,:) = 0 +PREEL_COMPLEX(:,:) = 0 !$ACC END KERNELS OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = (D_NSTAGTF(KGL)/2)+1 !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 - PREEL(2*JF-1,JM+IOFF) = FOUBUF(ISTA+2*JF-1) - PREEL(2*JF, JM+IOFF) = FOUBUF(ISTA+2*JF ) + PREEL_COMPLEX(2*JF-1,JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF-1) + PREEL_COMPLEX(2*JF, JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF ) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 80ea7cb06..6fb5b3bdc 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -60,7 +60,7 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT) :: ZACHTE2 -INTEGER(KIND=JPIM) :: IOFF,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_COMPLEX,OFFSET_VAR INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -98,14 +98,14 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV(2*JF-1,JM+IOFF) = PUV(2*JF-1,JM+IOFF)*ZACHTE2 - PUV(2*JF, JM+IOFF) = PUV(2*JF ,JM+IOFF)*ZACHTE2 + PUV(2*JF-1,JM+IOFF_COMPLEX) = PUV(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 + PUV(2*JF, JM+IOFF_COMPLEX) = PUV(2*JF ,JM+IOFF_COMPLEX)*ZACHTE2 ENDDO ENDDO ENDDO @@ -117,14 +117,14 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_NSDER(2*JF-1,JM+IOFF) = PSCALARS_NSDER(2*JF-1,JM+IOFF)*ZACHTE2 - PSCALARS_NSDER(2*JF, JM+IOFF) = PSCALARS_NSDER(2*JF, JM+IOFF)*ZACHTE2 + PSCALARS_NSDER(2*JF-1,JM+IOFF_COMPLEX) = PSCALARS_NSDER(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 + PSCALARS_NSDER(2*JF, JM+IOFF_COMPLEX) = PSCALARS_NSDER(2*JF, JM+IOFF_COMPLEX)*ZACHTE2 ENDDO ENDDO ENDDO @@ -142,14 +142,14 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV_EWDER(2*JF-1,JM+IOFF) = -PUV(2*JF, JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - PUV_EWDER(2*JF, JM+IOFF) = PUV(2*JF-1,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PUV(2*JF, JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PUV_EWDER(2*JF, JM+IOFF_COMPLEX) = PUV(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO @@ -161,14 +161,14 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF = D_NSTAGTF(KGL)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_EWDER(2*JF-1,JM+IOFF) = -PSCALARS(2*JF ,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) - PSCALARS_EWDER(2*JF, JM+IOFF) = PSCALARS(2*JF-1,JM+IOFF)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PSCALARS(2*JF ,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PSCALARS_EWDER(2*JF, JM+IOFF_COMPLEX) = PSCALARS(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index d101cc766..ebabe79cd 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -97,7 +97,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) -REAL(KIND=JPRBT), POINTER :: ZGTF(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF_REAL(:,:) +REAL(KIND=JPRBT), POINTER :: ZGTF_COMPLEX(:,:) REAL(KIND=JPRBT), POINTER :: & & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) @@ -117,7 +118,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(107,0) CALL GSTATS(1639,0) -! Compute ZGTF Domain decomposition +! Compute ZGTF_COMPLEX Domain decomposition IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence @@ -130,43 +131,44 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST -! Note that this buffer is 2X too large, we will need to transpose ZGTF to get rid of this -ALLOCATE(ZGTF(2*KF_FS, D%NLENGTF)) -!$ACC ENTER DATA CREATE(ZGTF) +ALLOCATE(ZGTF_COMPLEX(2*KF_FS, D%NLENGTF/2)) +!$ACC ENTER DATA CREATE(ZGTF_COMPLEX) ! And reiterate domain decomposition to assign pointers IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -PUV => ZGTF(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) +PUV => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) IFIRST = IFIRST + 2*KF_UV ! U and V -PSCALARS => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) +PSCALARS => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) THEN - PSCALARS_NSDER => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + PSCALARS_NSDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ENDIF IF (LUVDER) THEN - PUV_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) + PUV_EWDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF IF (LSCDERS) THEN - PSCALARS_EWDER => ZGTF(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + PSCALARS_EWDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF -! from FOUBUF to ZGTF. Divide by two because we move into complex space now -CALL FOURIER_IN(FOUBUF,ZGTF,KF_INPUT/2) +! from FOUBUF to ZGTF_COMPLEX. Divide by two because we consider this complex space now +CALL FOURIER_IN(FOUBUF,ZGTF_COMPLEX,KF_INPUT/2) ! 2. Fourier space computations -! fill the rest of ZGTF +! fill the rest of ZGTF_COMPLEX CALL FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) ! 3. Fourier transform ! inplace operation IF(KF_FS > 0) THEN - CALL FTINV(ZGTF,KF_FS) + CALL FTINV(ZGTF_COMPLEX,ZGTF_REAL,KF_FS) +ELSE + ZGTF_REAL => ZGTF_COMPLEX ENDIF CALL GSTATS(1639,1) @@ -257,7 +259,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) -CALL TRLTOG_CUDAAWARE(ZGTF,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) @@ -278,14 +280,14 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(422,1) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' -CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif CALL GSTATS(157,1) ! ------------------------------------------------------------------ -!$ACC EXIT DATA DELETE(ZGTF) -DEALLOCATE(ZGTF) +!$ACC EXIT DATA DELETE(ZGTF_REAL) +DEALLOCATE(ZGTF_REAL) END SUBROUTINE FTINV_CTL END MODULE FTINV_CTL_MOD diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index e35401929..db9aa64d5 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -10,7 +10,7 @@ MODULE FTINV_MOD CONTAINS -SUBROUTINE FTINV(PREEL,KFIELD) +SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) !**** *FTINV - Inverse Fourier transform @@ -57,12 +57,12 @@ SUBROUTINE FTINV(PREEL,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:,:) -INTEGER(KIND=JPIM) :: IGLG,IOFF,KGL,IRET +INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_C2R INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -REAL(KIND=JPRBT), POINTER :: PREEL2(:,:), TMP(:,:) ! ------------------------------------------------------------------ @@ -77,10 +77,10 @@ SUBROUTINE FTINV(PREEL,KFIELD) ENDIF -ALLOCATE(PREEL2(SIZE(PREEL,1),SIZE(PREEL,2))) -!$ACC ENTER DATA CREATE(PREEL2) +ALLOCATE(PREEL_REAL(KFIELD,D%NLENGTF)) +!$ACC ENTER DATA CREATE(PREEL_REAL) -!$ACC DATA PRESENT(PREEL,PREEL2) +!$ACC DATA PRESENT(PREEL_COMPLEX,PREEL_REAL) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) @@ -91,12 +91,13 @@ SUBROUTINE FTINV(PREEL,KFIELD) DO KGL=IBEG,IEND,IINC - IOFF=D%NSTAGTF(KGL)+1 + IOFF_REAL=D%NSTAGTF(KGL)+1 + IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD*2) - !$ACC HOST_DATA USE_DEVICE(PREEL,PREEL2) - CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1,IOFF),PREEL2(1,IOFF)) + CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) + !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) + CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL_COMPLEX(1,IOFF_COMPLEX),PREEL_REAL(1,IOFF_REAL)) !$ACC END HOST_DATA END DO IRET = CUDA_SYNCHRONIZE() @@ -110,14 +111,8 @@ SUBROUTINE FTINV(PREEL,KFIELD) !$ACC END DATA -! Swap pointers -TMP => PREEL -PREEL => PREEL2 -PREEL2 => TMP - -! and deallocate the local pointer -!$ACC EXIT DATA DELETE(PREEL2) -DEALLOCATE(PREEL2) +!$ACC EXIT DATA DELETE(PREEL_COMPLEX) +DEALLOCATE(PREEL_COMPLEX) ! ------------------------------------------------------------------ From cac89b3a77f77c7285c5eaad1d539d6b368f6ce9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:16 -0700 Subject: [PATCH 133/263] slightly reduce data regions overlap in ftdir --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 22 --------- src/trans/gpu/internal/trgtol_mod.F90 | 58 ++++++++++++++++-------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index b1ea52d74..14aef5346 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -158,30 +158,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(158,0) #ifdef USE_CUDA_AWARE_MPI_FT -IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) -ENDIF -CALL GSTATS(412,0) -!$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) -!$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) -!$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) -!$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) -!$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) -IF (LSYNC_TRANS) THEN - CALL GSTATS(432,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(432,1) -ENDIF -CALL GSTATS(412,1) CALL TRGTOL_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA #else CALL TRGTOL(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 4f250a444..592b8a8f5 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -224,14 +224,30 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& IRECVTOT(JROC) = IPOS*KF_FS ENDDO + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PGLAT) ASYNC(1) + CALL GSTATS(1805,1) - !$ACC DATA PRESENT(PGLAT) COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) + ! Put data on device for copyin + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(412,0) + !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) ASYNC(1) + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(432,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) + ENDIF + CALL GSTATS(412,1) ! TODO We should do the local contribution *WHILE* sending the data... ! Copy local contribution @@ -330,12 +346,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO + ! Do this with "enter data" syntax because we are in the PGP data clause IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) - IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) - !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) + !$ACC ENTER DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) !....Pack loop......................................................... + !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) CALL GSTATS(1602,0) DO INS=1,ISEND_COUNTS @@ -402,6 +418,16 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF !$ACC END DATA ENDDO + !$ACC END DATA !ZCOMBUFS (present) + !$ACC END DATA !PGP3B + !$ACC END DATA !PGP3A + !$ACC END DATA !PGP2 + !$ACC END DATA !PGPUV + !$ACC END DATA !PGP + + IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) + !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) + !$ACC WAIT(1) CALL GSTATS(1602,1) @@ -444,6 +470,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF CALL GSTATS(411,1) + !$ACC EXIT DATA IF(ISEND_COUNTS > 0) DELETE(ZCOMBUFS) + IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) + ! Unpack loop......................................................... CALL GSTATS(1603,0) @@ -462,20 +491,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ENDDO ENDDO ENDDO - !$ACC END DATA CALL GSTATS(1603,1) - !$ACC END DATA !! ZCOMBUFS - !$ACC END DATA !! ZCOMBUFS - !$ACC END DATA !! PRESENT(PGP3B) - !$ACC END DATA !! PRESENT(PGP3A) - !$ACC END DATA !! PRESENT(PGP2) - !$ACC END DATA !! PRESENT(PGPUV) - !$ACC END DATA !! PRESENT(PGP) + !$ACC END DATA ! ZCOMBUFR + !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES !$ACC WAIT(1) - IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) From c507767e73550f518dea45d0e1002c9103ba2635 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:16 -0700 Subject: [PATCH 134/263] slightly reduce data region overlap in ftinv --- src/trans/gpu/internal/ftdir_ctlad_mod.F90 | 6 +- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 25 ------- src/trans/gpu/internal/trltog_mod.F90 | 77 +++++++++++++++------- 3 files changed, 56 insertions(+), 52 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 b/src/trans/gpu/internal/ftdir_ctlad_mod.F90 index 5e227fc2b..aa14925ed 100755 --- a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctlad_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -172,8 +173,9 @@ SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) IST = IST+KF_SCALARS_G ENDIF -CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) +stop 4 +!CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +! &PGP,PGPUV,PGP3A,PGP3B,PGP2) CALL GSTATS(183,1) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index ebabe79cd..b006bfb1c 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -254,30 +254,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' -!$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) -!$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) -!$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) -!$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) -!$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) CALL TRLTOG_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) -IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) -ENDIF -CALL GSTATS(422,0) -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA -!$ACC END DATA -IF (LSYNC_TRANS) THEN - CALL GSTATS(442,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(442,1) -ENDIF -CALL GSTATS(422,1) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' CALL TRLTOG(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& @@ -286,8 +264,5 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,1) ! ------------------------------------------------------------------ -!$ACC EXIT DATA DELETE(ZGTF_REAL) -DEALLOCATE(ZGTF_REAL) - END SUBROUTINE FTINV_CTL END MODULE FTINV_CTL_MOD diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index f15fceb68..f9b2bf771 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -89,7 +89,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(IN) :: PGLAT(:,:) + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G @@ -175,7 +175,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, CALL ABORT_TRANS("INCONSISTENCY IN SCALARS") ENDIF - ! ! This is only relevant if we use the split interface (i.e. not PGP) + ! This is only relevant if we use the split interface (i.e. not PGP) IUVPAR = 1 IOFF=1 IF(LVORGP) THEN @@ -341,13 +341,16 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ISENDTOT(JROC) = IPOS*KF_FS ENDDO - !$ACC DATA PRESENT(PGLAT) COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) + !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) + + !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) + ! Present until self contribution and packing are done + !$ACC DATA PRESENT(PGLAT) ASYNC(1) CALL GSTATS(1806,1) ! Copy local contribution @@ -441,12 +444,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDDO IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) - IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) - !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) + !$ACC ENTER DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) + !$ACC DATA PRESENT(ZCOMBUFS) ASYNC(1) CALL GSTATS(1605,0) - DO INS=1,ISEND_COUNTS IPROC = ISEND_TO_PROC(INS) ILEN = ISENDTOT(IPROC)/KF_FS @@ -460,9 +461,17 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDDO ENDDO ENDDO + CALL GSTATS(1605,1) + !$ACC END DATA ! ZCOMBUFS + + !$ACC END DATA ! PGLAT + !$ACC EXIT DATA DELETE(PGLAT) ASYNC(1) + + IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) + !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) !$ACC WAIT(1) - CALL GSTATS(1605,1) + DEALLOCATE(PGLAT) CALL GSTATS(805,0) @@ -507,6 +516,9 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDIF CALL GSTATS(421,1) + !$ACC EXIT DATA IF(ISEND_COUNTS > 0) DELETE(ZCOMBUFS) + IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) + CALL GSTATS(805,1) ! Unpack loop......................................................... @@ -565,24 +577,36 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ENDIF ENDDO ENDDO - ENDIF + ENDIF !$ACC END DATA ENDDO - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA + !$ACC END DATA ! ZOMBUFR + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) + !$ACC END DATA ! PGP3B + !$ACC END DATA ! PGP3A + !$ACC END DATA ! PGP2 + !$ACC END DATA ! PGPUV + !$ACC END DATA ! PGP + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) + !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES - !$ACC END DATA !! CREATE ZCOMBUFR - !$ACC END DATA !! CREATE ZCOMBUFS !$ACC WAIT(1) + IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) CALL GSTATS(1606,1) - IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) - IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) @@ -669,7 +693,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(IN) :: PGLAT(:,:) + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PGLAT(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G @@ -1246,6 +1270,9 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) + !$ACC EXIT DATA DELETE(PGLAT) + DEALLOCATE(PGLAT) + IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) END SUBROUTINE TRLTOG From 9dfdc8865f38bf58206ad401fbe5c7861458d4c9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:16 -0700 Subject: [PATCH 135/263] Do not zero out full PREEL, but only the parts that will not be set --- src/trans/gpu/internal/fourier_in_mod.F90 | 19 ++++++++++--------- src/trans/gpu/internal/fsc_mod.F90 | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 136b2da3a..8d23f2880 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -40,7 +40,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G,G_NMEN +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN ! IMPLICIT NONE @@ -62,14 +62,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) IINC=-1 ENDIF -!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) - -! TODO: We don't need to zero out the full array here but we need to zero out because implicit -! truncation happens. We cannot rely on previous iterations that they had the same configuration. - -!$ACC KERNELS ASYNC(1) -PREEL_COMPLEX(:,:) = 0 -!$ACC END KERNELS +!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) @@ -86,6 +79,14 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) PREEL_COMPLEX(2*JF-1,JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF-1) PREEL_COMPLEX(2*JF, JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF ) ENDDO + !$ACC LOOP SEQ + DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 + ! Truncation (not sure what is the exact upper bound here...) + ! Same is also in FSC for the new fields. I *think* it should be N/2+1 elements in total + ! TODO: Make sure this is correct + PREEL_COMPLEX(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + ENDDO ENDDO ENDDO !$ACC END DATA diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 6fb5b3bdc..3aae18f96 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -47,7 +47,7 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF -USE TPM_GEOMETRY ,ONLY : G, G_NMEN +USE TPM_GEOMETRY ,ONLY : G, G_NMEN, G_NLOEN USE TPM_FIELDS ,ONLY : F USE TPM_FLT ,ONLY: S USE TPM_GEN, ONLY: NOUT @@ -80,7 +80,7 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA IINC=-1 ENDIF -!$ACC DATA PRESENT(D,G,F,D_NSTAGTF,G_NMEN,PUV,PSCALARS,PSCALARS_NSDER,PUV_EWDER,PSCALARS_EWDER) +!$ACC DATA PRESENT(D,G,F,D_NSTAGTF,G_NMEN,PUV,PSCALARS,PSCALARS_NSDER,PUV_EWDER,PSCALARS_EWDER,G_NLOEN) IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" @@ -151,6 +151,11 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA PUV_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PUV(2*JF, JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) PUV_EWDER(2*JF, JM+IOFF_COMPLEX) = PUV(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO + !$ACC LOOP SEQ + DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 + PUV_EWDER(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT + PUV_EWDER(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + ENDDO ENDDO ENDDO ENDIF @@ -170,6 +175,11 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA PSCALARS_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PSCALARS(2*JF ,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) PSCALARS_EWDER(2*JF, JM+IOFF_COMPLEX) = PSCALARS(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO + !$ACC LOOP SEQ + DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 + PSCALARS_EWDER(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT + PSCALARS_EWDER(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + ENDDO ENDDO ENDDO ENDIF From 958d0f82b4b7dc61ddee955a431f852ca9d81427 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:16 -0700 Subject: [PATCH 136/263] use proper size for FOUBUFS/R --- src/trans/gpu/internal/sump_trans_mod.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 806b0e4f9..04dda6722 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -163,8 +163,8 @@ SUBROUTINE SUMP_TRANS D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) ENDDO - D%NLENGT0B = IAUX0*NPRTRNS - D%NLENGT1B = IAUX1*NPRTRNS + D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS) + D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS) ENDIF ! GRIDPOINT SPACE From 8d4f385547e9624f30f604e47feed8b17fcbf729 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:16 -0700 Subject: [PATCH 137/263] Cleanup copyins --- src/trans/gpu/external/setup_trans.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 0e17ae0b2..dbd1a9fad 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -464,8 +464,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& print*,'R%NSMAX=',R%NSMAX !$ACC ENTER DATA & -!$ACC& COPYIN(F,F%RN,F%RLAPIN,S,S%FA,S%ITHRESHOLD,S%LUSEFLT,D,D%NUMP,D%MYMS,R,R%NDGNH,R%NSMAX,G,G%NDGLU) & -!$ACC& COPYIN(D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,G%NMEN,D%NPROCM,D%NPTRLS,G,G%NLOEN,D%MSTABF) +!$ACC& COPYIN(F,F%RN,F%RLAPIN,D,D%MYMS,R,G,G%NDGLU) & +!$ACC& COPYIN(D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,G%NMEN,D%NPROCM,D%NPTRLS,G,D%MSTABF) ! Initialize A arrays From 740075f8f3a38324f100ff79f07ea78b11ae2cf4 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:17 -0700 Subject: [PATCH 138/263] Fix intent when allocatable state is being changed --- src/trans/gpu/internal/fourier_in_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ledir_mod.F90 | 2 +- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 2 +- src/trans/gpu/internal/ltdir_mod.F90 | 2 +- src/trans/gpu/internal/trltom_mod.F90 | 2 +- src/trans/gpu/internal/trmtol_mod.F90 | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 8d23f2880..60b31b43d 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -45,7 +45,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) IMPLICIT NONE -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:,:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index b006bfb1c..41cffee3c 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -95,7 +95,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) REAL(KIND=JPRBT), POINTER :: ZGTF_REAL(:,:) REAL(KIND=JPRBT), POINTER :: ZGTF_COMPLEX(:,:) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 8ef2a2285..2dd2793fd 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -72,7 +72,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! DUMMY ARGUMENTS -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index eaff425c8..e0667f5de 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -57,7 +57,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF_IN(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index e4d6b21aa..57bb0a1ac 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -103,7 +103,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(IN) :: FOUBUF(:) + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 4e331d8a1..71f204788 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -75,7 +75,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS REAL(KIND=JPRBT) ,INTENT(OUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(INOUT), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 593994059..c5acfac8f 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -75,7 +75,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(OUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) +REAL(KIND=JPRBT), INTENT(INOUT), ALLOCATABLE :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK From a6873aa3181e5dedde6d1d0588333412e1baaae4 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:17 -0700 Subject: [PATCH 139/263] FIX: Do not over-compute ZOUT0 --- src/trans/gpu/internal/ledir_mod.F90 | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 2dd2793fd..1eb92bf46 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -93,8 +93,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ALLOCATE(ZINPA(2*KF_FS*R_NDGNH*D_NUMP)) ALLOCATE(ZINPS(2*KF_FS*R_NDGNH*D_NUMP)) ALLOCATE(ZOUT(2*KF_FS*TDZAS*D_NUMP)) -ALLOCATE(ZINP0(2*KF_FS*R_NDGNH)) -ALLOCATE(ZOUT0(2*KF_FS*TDZAS)) +ALLOCATE(ZINP0(KF_FS*R_NDGNH)) +ALLOCATE(ZOUT0(KF_FS*TDZAS)) !$ACC DATA & !$ACC& CREATE(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & @@ -189,7 +189,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + ZINP0((JF-1)/2+1+(JGL-1)*KF_FS) & & = ZINPA((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -202,12 +202,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUT0) CALL CUDA_DGEMM_BATCHED( & & 'N', 'N', & - & 2*KF_FS, TDZAA, R_NDGNH, & + & KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRD, & - & ZINP0, 2*KF_FS, R_NDGNH, & + & ZINP0, KF_FS, R_NDGNH, & & ZAA0, R_NDGNH, TDZAA, & & 0.0_JPRD, & - & ZOUT0, 2*KF_FS, TDZAA, & + & ZOUT0, KF_FS, TDZAA, & & 1) !$ACC END HOST_DATA @@ -215,7 +215,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) DO J=1,(R_NSMAX+2)/2 DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*KF_FS) ENDDO ENDDO ENDIF @@ -267,7 +267,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*2*KF_FS) & + ZINP0((JF-1)/2+1+(JGL-1)*KF_FS) & & = ZINPS((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) ENDDO ENDDO @@ -279,12 +279,12 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC host_data use_device(ZAS0,ZINP0,ZOUT0) call CUDA_DGEMM_BATCHED( & & 'N', 'N', & - & 2*KF_FS, TDZAS, R_NDGNH, & + & KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRD, & - & ZINP0, 2*KF_FS, R_NDGNH, & + & ZINP0, KF_FS, R_NDGNH, & & ZAS0, R_NDGNH, TDZAS, & & 0.0_JPRD, & - & ZOUT0, 2*KF_FS, TDZAS, & + & ZOUT0, KF_FS, TDZAS, & & 1) !$ACC end host_data @@ -292,7 +292,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) DO J=1,(R_NSMAX+3)/2 DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*2*KF_FS) + POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*KF_FS) ENDDO ENDDO From 34f5c63a4633b4dcbf76a49879541421f38005b8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:17 -0700 Subject: [PATCH 140/263] Compute m=0 in double precision for inverse transform (CHANGE1: 5) (CHANGE2: 7) --- src/trans/gpu/internal/leinv_mod.F90 | 72 +++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 0edb7e15d..77138ebfa 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -55,10 +55,11 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : ZAA,ZAS,TDZAA,TDZAS +USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B USE TPM_GEN, ONLY: NOUT USE CUDA_GEMM_BATCHED_MOD +USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC @@ -77,6 +78,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! LOCAL REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUTS(:), ZOUTA(:) REAL(KIND=JPRBT) :: ZAOA, ZSOA +REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 INTEGER(KIND=JPIM) :: KFIELDS @@ -98,9 +100,11 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOCATE(ZINP(KFIELDS*TDZAS*D_NUMP)) ALLOCATE(ZOUTS(KFIELDS*R_NDGNH*D_NUMP)) ALLOCATE(ZOUTA(KFIELDS*R_NDGNH*D_NUMP)) +ALLOCATE(ZINP0(KFIELDS/2*TDZAS)) +ALLOCATE(ZOUT0(KFIELDS/2*R_NDGNH)) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & -!$ACC& CREATE (ZINP,ZOUTS,ZOUTA) & +!$ACC& CREATE (ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) @@ -171,6 +175,39 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,1) +IF (KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) + DO J=1,(R_NSMAX+2)/2 + DO JK=1,KFIELDS,2 + IA = 1+MOD(R_NSMAX+2,2) + ZINP0((JK-1)/2+1+(J-1)*KFIELDS/2) = PIA(JK,IA+1+(J-1)*2,KMLOC0) + ENDDO + ENDDO + + !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUT0) + CALL CUDA_DGEMM_BATCHED( & + & 'N', 'T', & + & KFIELDS/2, R_NDGNH, TDZAA, & + & 1.0_JPRD, & + & ZINP0, KFIELDS/2, TDZAA, & + & ZAA0, R_NDGNH, TDZAA, & + & 0.0_JPRD, & + & ZOUT0, KFIELDS/2, R_NDGNH, & + & 1) + !$ACC END HOST_DATA + + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JK=1,KFIELDS,2 + ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZOUTA((JK-1)/2+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*KFIELDS) & + & = ZOUT0((JK-1)/2+1+(JGL-1)*KFIELDS/2) + ENDDO + ENDDO + +ENDIF + ! 2. +++++++++++++ symmetric !IF KM=0 and NSMAX is 6: ! IS=2 @@ -227,6 +264,37 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,1) +IF (KMLOC0 > 0) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) + DO J=1,(R_NSMAX+3)/2 + DO JK=1,KFIELDS,2 + IS = 1+MOD(R_NSMAX+1,2) + ZINP0((JK-1)/2+1+(J-1)*KFIELDS/2) = PIA(JK,IS+1+(J-1)*2,KMLOC0) + ENDDO + ENDDO + + !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINP0,ZOUT0) + CALL CUDA_DGEMM_BATCHED( & + & 'N', 'T', & + & KFIELDS/2, R_NDGNH, TDZAS, & + & 1.0_JPRD, & + & ZINP0, KFIELDS/2, TDZAS, & + & ZAS0, R_NDGNH, TDZAS, & + & 0.0_JPRD, & + & ZOUT0, KFIELDS/2, R_NDGNH, & + & 1) + !$ACC END HOST_DATA + + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JK=1,KFIELDS,2 + ZOUTS((JK-1)/2+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*KFIELDS) & + & = ZOUT0((JK-1)/2+1+(JGL-1)*KFIELDS/2) + ENDDO + ENDDO + +ENDIF + ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) !$ACC ENTER DATA CREATE(FOUBUF_IN) From 93746eb29c440dc2d2ceca4813f90812608da135 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:17 -0700 Subject: [PATCH 141/263] Use cudaGraphs for FFTs --- .../external/fourier/execute_plan_fftc.cu | 148 ++++++++++++++++++ src/trans/gpu/internal/ftdir_mod.F90 | 13 +- src/trans/gpu/internal/ftinv_mod.F90 | 23 +-- src/trans/gpu/internal/tpm_fftc.F90 | 146 ++++++++++++++++- 4 files changed, 313 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index 3acf1afbf..e5434de92 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -1,5 +1,9 @@ #include "cufft.h" #include "stdio.h" +#include +#include +#include + static const char *_cudaGetErrorEnum(cufftResult error) { switch (error) { case CUFFT_SUCCESS: @@ -35,6 +39,15 @@ static const char *_cudaGetErrorEnum(cufftResult error) { return ""; } +#define CUDA_CHECK(e) \ + { \ + cudaError_t err = (e); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } #define CUFFT_CHECK(e) { \ cufftResult_t err = (e); \ @@ -78,3 +91,138 @@ execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, abort(); } } + +namespace { +struct Double { + using real = double; + using cmplx = cufftDoubleComplex; +}; +struct Float { + using real = float; + using cmplx = cufftComplex; +}; +} +template +void execute_fft(typename Type::real *data_real, typename Type::cmplx *data_complex, + int kfield, int *loens, int *offsets, int nfft) { + using real = typename Type::real; + using cmplx = typename Type::cmplx; + + /* static std::unordered_map allocationCache; // nloens -> ptr */ + static std::unordered_map> fftPlansCache; // kfield -> handles + static std::unordered_map graphCache; // kfield -> graphs + + // if the pointers are changed, we need to update the graph + static std::unordered_map> ptrCache; // kfield -> ptrs + + auto ptrs = ptrCache.find(kfield); + if (ptrs != ptrCache.end() && ( + ptrs->second.first != data_real || ptrs->second.second != data_complex)) { + // the plan is cached, but the pointers are not correct. we remove and delete the graph, + // but we keep the FFT plans, if this happens more often, we should cache this... + std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); + graphCache.erase(kfield); + ptrCache.erase(kfield); + } + + auto graph = graphCache.find(kfield); + if (graph == graphCache.end()) { + // this graph does not exist yet + + auto fftPlans = fftPlansCache.find(kfield); + if (fftPlans == fftPlansCache.end()) { + // the fft plans do not exist yet + std::vector newPlans; + newPlans.resize(nfft); + for (int i = 0; i < nfft; ++i) { + int nloen = loens[i]; + + cufftHandle plan; + CUFFT_CHECK(cufftCreate(&plan)); + int dist = 1; + int embed[] = {1}; + CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, + kfield, dist, Direction, kfield)); + newPlans[i] = plan; + } + fftPlansCache.insert({kfield, newPlans}); + } + fftPlans = fftPlansCache.find(kfield); + + // create a temporary stream + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + for (auto &plan : fftPlans->second) // set the streams + CUFFT_CHECK(cufftSetStream(plan, stream)); + + // now create the cuda graph + cudaGraph_t new_graph; + cudaGraphCreate(&new_graph, 0); + for (int i = 0; i < nfft; ++i) { + int offset = offsets[i]; + real *data_real_l = &data_real[kfield * offset]; + cmplx *data_complex_l = &data_complex[kfield * offset / 2]; + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + if constexpr(Direction == CUFFT_R2C) + CUFFT_CHECK(cufftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr(Direction == CUFFT_C2R) + CUFFT_CHECK(cufftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)) + else if constexpr(Direction == CUFFT_D2Z) + CUFFT_CHECK(cufftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr(Direction == CUFFT_Z2D) + CUFFT_CHECK(cufftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); + cudaGraph_t my_graph; + CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); + cudaGraphNode_t my_node; + CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); + } + cudaGraphExec_t instance; + CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaGraphDestroy(new_graph)); + + graphCache.insert({kfield, instance}); + ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); + } + + CUDA_CHECK(cudaGraphLaunch(graphCache.at(kfield), 0)); + /* for (int i = 0; i < nfft; ++i) { */ + /* int nloen = loens[i]; */ + + /* cufftHandle plan; */ + /* CUFFT_CHECK(cufftCreate(&plan)); */ + /* int dist = 1; */ + /* int embed[] = {1}; */ + /* CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, */ + /* kfield, dist, Direction, kfield)); */ + /* int offset = offsets[i]; */ + /* real *data_real_l = &data_real[kfield * offset]; */ + /* cmplx *data_complex_l = &data_complex[kfield * offset / 2]; */ + /* if (Direction == CUFFT_R2C) */ + /* CUFFT_CHECK(cufftExecR2C(plan, data_real_l, data_complex_l)) */ + /* else */ + /* CUFFT_CHECK(cufftExecC2R(plan, data_complex_l, data_real_l)); */ + /* CUFFT_CHECK(cufftDestroy(plan)); */ + /* } */ + CUDA_CHECK(cudaDeviceSynchronize()); +} +extern "C" { +void execute_dir_fft_float(float *data_real, cufftComplex *data_complex, + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); +} +void execute_inv_fft_float(cufftComplex *data_complex, float *data_real, + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); +} +void execute_dir_fft_double(double *data_real, cufftDoubleComplex *data_complex, + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); +} +void execute_inv_fft_double(cufftDoubleComplex *data_complex, double *data_real, + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); +} +} diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 4240b8189..37db64cde 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -49,7 +49,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_DIR_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -88,6 +88,9 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) +CALL EXECUTE_DIR_FFT(PREEL_REAL(:,:),PREEL_COMPLEX(:,:),KFIELD, & + & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & + & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) DO KGL=IBEG,IEND,IINC @@ -98,10 +101,10 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) - !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) - CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL_REAL(1,IOFF_REAL),PREEL_COMPLEX(1,IOFF_COMPLEX)) - !$ACC END HOST_DATA + ! CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) + ! !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) + ! CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL_REAL(1,IOFF_REAL),PREEL_COMPLEX(1,IOFF_COMPLEX)) + ! !$ACC END HOST_DATA END DO IRET = CUDA_SYNCHRONIZE() diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index db9aa64d5..6c085e1cf 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -49,7 +49,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_INV_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -88,18 +88,21 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) +CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:,:),PREEL_REAL(:,:),KFIELD, & + & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & + & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) -DO KGL=IBEG,IEND,IINC +! DO KGL=IBEG,IEND,IINC - IOFF_REAL=D%NSTAGTF(KGL)+1 - IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 + ! IOFF_REAL=D%NSTAGTF(KGL)+1 + ! IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 + ! IGLG = D%NPTRLS(MYSETW)+KGL-1 - CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) - !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) - CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL_COMPLEX(1,IOFF_COMPLEX),PREEL_REAL(1,IOFF_REAL)) - !$ACC END HOST_DATA -END DO + ! CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) + ! !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) + ! CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL_COMPLEX(1,IOFF_COMPLEX),PREEL_REAL(1,IOFF_REAL)) + ! !$ACC END HOST_DATA +! END DO IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index 3f9c6ad00..a3f9c0e49 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -20,7 +20,7 @@ MODULE TPM_FFTC USE, INTRINSIC :: ISO_C_BINDING -USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT +USE PARKIND_ECTRANS, ONLY: JPIM USE MPL_MODULE, ONLY : MPL_MYRANK IMPLICIT NONE @@ -29,7 +29,7 @@ MODULE TPM_FFTC PRIVATE PUBLIC CREATE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, INIT_PLANS_FFT, & - & FFTC_RESOL, TC + & FFTC_RESOL, TC, EXECUTE_DIR_FFT, EXECUTE_INV_FFT TYPE FFTC_TYPE INTEGER(KIND=JPIM),POINTER :: N_PLANS(:) @@ -52,6 +52,48 @@ MODULE TPM_FFTC TYPE(FFTC_TYPE),POINTER :: TC +INTERFACE EXECUTE_DIR_FFT + SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ISO_C_BINDING + IMPLICIT NONE + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + END SUBROUTINE + SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ISO_C_BINDING + IMPLICIT NONE + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + END SUBROUTINE +END INTERFACE + +INTERFACE EXECUTE_INV_FFT + SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ISO_C_BINDING + IMPLICIT NONE + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + END SUBROUTINE + SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ISO_C_BINDING + IMPLICIT NONE + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + END SUBROUTINE +END INTERFACE + ! ------------------------------------------------------------------ CONTAINS @@ -187,5 +229,105 @@ SUBROUTINE DESTROY_ALL_PLANS_FFT RETURN END SUBROUTINE DESTROY_ALL_PLANS_FFT +SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTERFACE + SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_dir_fft_float") + USE ISO_C_BINDING + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:,:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + END SUBROUTINE + END INTERFACE + + !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) + CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS)) + !$ACC END HOST_DATA + +END SUBROUTINE +SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTERFACE + SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_dir_fft_double") + USE ISO_C_BINDING + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:,:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + END SUBROUTINE + END INTERFACE + + !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) + CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS)) + !$ACC END HOST_DATA + +END SUBROUTINE + +SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTERFACE + SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_inv_fft_float") + USE ISO_C_BINDING + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + END SUBROUTINE + END INTERFACE + + !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) + CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS)) + !$ACC END HOST_DATA +END SUBROUTINE + +SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) + USE PARKIND_ECTRANS ,ONLY : JPIM + + IMPLICIT NONE + + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTERFACE + SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_inv_fft_double") + USE ISO_C_BINDING + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT + END SUBROUTINE + END INTERFACE + + !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) + CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS)) + !$ACC END HOST_DATA +END SUBROUTINE END MODULE TPM_FFTC From dc3c6f24364dbd05f78cb7d9d9a14f5a8ea973b0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:17 -0700 Subject: [PATCH 142/263] Linearize PREEL_XXX for dir_trans --- src/trans/gpu/external/gpnorm_trans.F90 | 6 +- src/trans/gpu/internal/fourier_out_mod.F90 | 8 +-- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 26 ++++---- src/trans/gpu/internal/ftdir_mod.F90 | 8 +-- src/trans/gpu/internal/ftinv_ctlad_mod.F90 | 8 ++- src/trans/gpu/internal/tpm_fftc.F90 | 20 +++--- src/trans/gpu/internal/trgtol_mod.F90 | 71 +++++++++------------- 7 files changed, 70 insertions(+), 77 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 3f92ba0d2..78d78e56a 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2008- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -166,7 +167,10 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ! done in setup_trans LGPNORM=.TRUE. -CALL TRGTOL(ZGTF,IF_FS,IF_GP,IF_SCALARS_G,IVSET,PGP=PGP) +print *, "not supported" +flush(6) +stop 1 +! CALL TRGTOL(ZGTF,IF_FS,IF_GP,IF_SCALARS_G,IVSET,PGP=PGP) LGPNORM=.FALSE. ! ZGTF is now on GPU diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index f15ee973a..bd70fb256 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -45,7 +45,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) IMPLICIT NONE -REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:,:) +REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS @@ -76,7 +76,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = (D_NSTAGTF(KGL)/2)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) !$ACC LOOP SEQ @@ -85,8 +85,8 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 ! This is not contiguous in PREEL due to the memory layout. - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(2*JF-1, JM+IOFF_COMPLEX) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(2*JF , JM+IOFF_COMPLEX) + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(2*JF-1+KFIELDS*2*(JM+IOFF_COMPLEX)) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(2*JF +KFIELDS*2*(JM+IOFF_COMPLEX)) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 14aef5346..0d6031a1f 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -92,8 +92,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) ! Local variables -REAL(KIND=JPRBT), POINTER :: ZGTF_REAL(:,:) -REAL(KIND=JPRBT), POINTER :: ZGTF_COMPLEX(:,:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -150,18 +150,18 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF -ALLOCATE(ZGTF_REAL(KF_FS, D%NLENGTF)) -!$ACC ENTER DATA CREATE(ZGTF_REAL) +ALLOCATE(PREEL_REAL(KF_FS*D%NLENGTF)) +!$ACC ENTER DATA CREATE(PREEL_REAL) ! Transposition CALL GSTATS(158,0) #ifdef USE_CUDA_AWARE_MPI_FT -CALL TRGTOL_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& +CALL TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #else -CALL TRGTOL(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif @@ -172,16 +172,16 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(ZGTF_REAL,ZGTF_COMPLEX,KF_FS) - CALL FOURIER_OUT(ZGTF_COMPLEX,FOUBUF_IN,KF_FS) + CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) + CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) ELSE - ZGTF_COMPLEX => ZGTF_REAL + PREEL_COMPLEX => PREEL_REAL ENDIF - CALL GSTATS(1640,1) -IF (ALLOCATED(ZGTF_COMPLEX)) THEN - !$ACC EXIT DATA DELETE(ZGTF_COMPLEX) - DEALLOCATE(ZGTF_COMPLEX) + +IF (ALLOCATED(PREEL_COMPLEX)) THEN + !$ACC EXIT DATA DELETE(PREEL_COMPLEX) + DEALLOCATE(PREEL_COMPLEX) ENDIF CALL GSTATS(106,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 37db64cde..a23859d4e 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -57,8 +57,8 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:,:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C @@ -77,7 +77,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ENDIF -ALLOCATE(PREEL_COMPLEX(2*KFIELD,D%NLENGTF/2)) +ALLOCATE(PREEL_COMPLEX(KFIELD*D%NLENGTF)) !$ACC ENTER DATA CREATE(PREEL_COMPLEX) !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) @@ -88,7 +88,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) -CALL EXECUTE_DIR_FFT(PREEL_REAL(:,:),PREEL_COMPLEX(:,:),KFIELD, & +CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) diff --git a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 b/src/trans/gpu/internal/ftinv_ctlad_mod.F90 index 55b8cb377..dfc1094f0 100755 --- a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctlad_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -195,8 +196,11 @@ SUBROUTINE FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& ENDIF CALL GSTATS(182,0) -CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) +print *, "not supported..." +flush(6) +stop +! CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& + ! &PGP,PGPUV,PGP3A,PGP3B,PGP2) CALL GSTATS(182,1) ! 3. Fourier transform diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index a3f9c0e49..d82d33d01 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -57,7 +57,7 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) USE PARKIND_ECTRANS ,ONLY : JPIM USE ISO_C_BINDING IMPLICIT NONE - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) @@ -66,7 +66,7 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) USE PARKIND_ECTRANS ,ONLY : JPIM USE ISO_C_BINDING IMPLICIT NONE - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) @@ -234,15 +234,15 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) IMPLICIT NONE - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) INTERFACE SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_dir_fft_float") USE ISO_C_BINDING - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:,:) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -259,15 +259,15 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) IMPLICIT NONE - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) INTERFACE SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_dir_fft_double") USE ISO_C_BINDING - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:,:) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 592b8a8f5..f82d26230 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -10,7 +10,7 @@ MODULE TRGTOL_MOD CONTAINS - SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& + SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *TRGTOL * - transposition of grid point data from column @@ -29,7 +29,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& ! Explicit arguments : ! -------------------- - ! PGLAT - Latitudinal data ready for direct FFT (output) + ! PREEL_REAL - Latitudinal data ready for direct FFT (output) ! PGP - Blocked grid point data (input) ! Implicit arguments : @@ -87,7 +87,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& IMPLICIT NONE - REAL(KIND=JPRBT),INTENT(OUT) :: PGLAT(:,:) + REAL(KIND=JPRBT),INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) @@ -224,7 +224,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& IRECVTOT(JROC) = IPOS*KF_FS ENDDO - !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PGLAT) ASYNC(1) + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) CALL GSTATS(1805,1) @@ -281,7 +281,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) IPOS = IRECV_BUFR_TO_OUT_V+JL - PGLAT(JFLD,IRECV_BUFR_TO_OUT(IPOS)) = PGP(JK,IFLD,JBLK) + PREEL_REAL(JFLD+KF_FS*(IRECV_BUFR_TO_OUT(IPOS)-1)) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -291,22 +291,22 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL)-1 IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) - PGLAT(JFLD,IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(JFLD+KF_FS*IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PGLAT(JFLD,IPOS) = PGP2(JK,IOFF+1,JBLK) + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - PGLAT(JFLD,IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - PGLAT(JFLD,IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -486,8 +486,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN - II = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL) - PGLAT(JFLD,II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) + II = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL)-1 + PREEL_REAL(JFLD+KF_FS*II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) ENDDO ENDDO ENDDO @@ -504,7 +504,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KVSET,KPTRGP,& END SUBROUTINE TRGTOL_CUDAAWARE - SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& + SUBROUTINE TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *TRGTOL * - transposition of grid point data from column @@ -521,7 +521,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! Explicit arguments : ! -------------------- - ! PGLAT - Latitudinal data ready for direct FFT (output) + ! PREEL_REAL - Latitudinal data ready for direct FFT (output) ! PGP - Blocked grid point data (input) ! Implicit arguments : @@ -583,7 +583,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IMPLICIT NONE - REAL(KIND=JPRBT),INTENT(OUT) :: PGLAT(:,:) + REAL(KIND=JPRBT),INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G @@ -941,10 +941,10 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ILAST = IGPTRSEND(2,JBLK,MYSETW) IF(LLPGPONLY) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 DO JFLD=1,IFLDS IFLD = IFLDOFF(JFLD) - PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLD,JBLK) + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -952,26 +952,23 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFLD = IFLDOFF(JFLD) IF(LLUV(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) - !if(jfld<=5 .and. kindex(ipos)<5) write(nout,*)'trgtol: ipos=',ipos,' idx=',kindex(ipos),' jk=',jk,' lev=',iuvlevs(ifld),' pars=',iuvpars(ifld),' pglat=',PGLAT(JFLD,KINDEX(IPOS)) - !if( jfld.eq.1 ) write(nout,*)'trgtoluv: ',kindex(ipos),' lev=',iuvlevs(ifld),' pars=',iuvpars(ifld),' pglat=',PGLAT(JFLD,KINDEX(IPOS)) + IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PREEL_REAL(JFLD+KF_FS*IPOS) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) ENDDO ELSEIF(LLGP2(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK) + IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP2(JK,IGP2PARS(IFLD),JBLK) ENDDO ELSEIF(LLGP3A(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) - !if( jk.eq.ifirst ) write(iunit,*)'trgtol: ',JK,JFLD,IFLD,kindex(ipos),' lev=',IGP3ALEVS(ifld),' pars=',IGP3APARS(ifld),' pglat=',PGLAT(JFLD,KINDEX(IPOS)) + IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) ENDDO ELSEIF(LLGP3B(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) + IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) ENDDO ELSE CALL ABORT_TRANS('TRLTOG_MOD: ERROR') @@ -1142,31 +1139,19 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IRECV_FLD_END = ZCOMBUFR(0,INR) DO JFLD=IRECV_FLD_START,IRECV_FLD_END DO JL=1,ILEN - II = KINDEX(INDOFF(IRECV)+JL) - PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-IRECV_FLD_START)*ILEN,INR) + II = KINDEX(INDOFF(IRECV)+JL)-1 + PREEL_REAL(JFLD+KF_FS*II) = ZCOMBUFR(JL+(JFLD-IRECV_FLD_START)*ILEN,INR) ENDDO ENDDO ENDDO !$OMP END PARALLEL DO - ! this appears to be important (otherwise, old data picked in PGLAT) - ! in particular, one would have thought that above ACC copy and update on the - ! device is the same as OMP loop + update device command below, but it seems not, and winds still in field index 1 from prev inv_trans !!! - !$ACC update device(PGLAT) - !$ACC wait - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=(TIMEF()-Tc)/1000.0_JPRBT - ! !IF(irank==0) WRITE(*,*) "unpacking (trgtol) in sec: ", Tc - !#endif - CALL GSTATS(1603,1) IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - !$ACC UPDATE DEVICE(PGLAT) + !$ACC UPDATE DEVICE(PREEL_REAL) IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) From 16e4320388e8638d4f0b6e227f4e2eb686a018a2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:18 -0700 Subject: [PATCH 143/263] linearize large parts of preel for inv --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 36 +++++++------ src/trans/gpu/internal/ftinv_mod.F90 | 8 +-- src/trans/gpu/internal/tpm_fftc.F90 | 24 ++++----- src/trans/gpu/internal/trltog_mod.F90 | 68 ++++++++++++------------ 4 files changed, 69 insertions(+), 67 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 41cffee3c..9c65c3301 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -97,8 +97,9 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) -REAL(KIND=JPRBT), POINTER :: ZGTF_REAL(:,:) -REAL(KIND=JPRBT), POINTER :: ZGTF_COMPLEX(:,:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX_2D(:,:) REAL(KIND=JPRBT), POINTER :: & & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) @@ -118,7 +119,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(107,0) CALL GSTATS(1639,0) -! Compute ZGTF_COMPLEX Domain decomposition +! Compute PREEL_COMPLEX Domain decomposition IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence @@ -131,44 +132,45 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST -ALLOCATE(ZGTF_COMPLEX(2*KF_FS, D%NLENGTF/2)) -!$ACC ENTER DATA CREATE(ZGTF_COMPLEX) +ALLOCATE(PREEL_COMPLEX((2*KF_FS)*(D%NLENGTF/2))) +!$ACC ENTER DATA CREATE(PREEL_COMPLEX) ! And reiterate domain decomposition to assign pointers +CALL C_F_POINTER(C_LOC(PREEL_COMPLEX), PREEL_COMPLEX_2D, [2*KF_FS,D%NLENGTF/2]) IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -PUV => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) +PUV => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) IFIRST = IFIRST + 2*KF_UV ! U and V -PSCALARS => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) +PSCALARS => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) THEN - PSCALARS_NSDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + PSCALARS_NSDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ENDIF IF (LUVDER) THEN - PUV_EWDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) + PUV_EWDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF IF (LSCDERS) THEN - PSCALARS_EWDER => ZGTF_COMPLEX(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + PSCALARS_EWDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF -! from FOUBUF to ZGTF_COMPLEX. Divide by two because we consider this complex space now -CALL FOURIER_IN(FOUBUF,ZGTF_COMPLEX,KF_INPUT/2) +! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now +CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX_2D,KF_INPUT/2) ! 2. Fourier space computations -! fill the rest of ZGTF_COMPLEX +! fill the rest of PREEL_COMPLEX CALL FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) ! 3. Fourier transform ! inplace operation IF(KF_FS > 0) THEN - CALL FTINV(ZGTF_COMPLEX,ZGTF_REAL,KF_FS) + CALL FTINV(PREEL_COMPLEX,PREEL_REAL,KF_FS) ELSE - ZGTF_REAL => ZGTF_COMPLEX + PREEL_REAL => PREEL_COMPLEX ENDIF CALL GSTATS(1639,1) @@ -254,11 +256,11 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL GSTATS(157,0) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' -CALL TRLTOG_CUDAAWARE(ZGTF_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #else !WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' -CALL TRLTOG(ZGTF_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& +CALL TRLTOG(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif CALL GSTATS(157,1) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 6c085e1cf..bef3162f5 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -57,8 +57,8 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:,:) -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:,:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_C2R @@ -77,7 +77,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ENDIF -ALLOCATE(PREEL_REAL(KFIELD,D%NLENGTF)) +ALLOCATE(PREEL_REAL(KFIELD*D%NLENGTF)) !$ACC ENTER DATA CREATE(PREEL_REAL) !$ACC DATA PRESENT(PREEL_COMPLEX,PREEL_REAL) @@ -88,7 +88,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) -CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:,:),PREEL_REAL(:,:),KFIELD, & +CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index d82d33d01..dbad5f624 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -78,8 +78,8 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) USE PARKIND_ECTRANS ,ONLY : JPIM USE ISO_C_BINDING IMPLICIT NONE - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) END SUBROUTINE @@ -87,8 +87,8 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) USE PARKIND_ECTRANS ,ONLY : JPIM USE ISO_C_BINDING IMPLICIT NONE - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) END SUBROUTINE @@ -285,15 +285,15 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) IMPLICIT NONE - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) INTERFACE SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_inv_fft_float") USE ISO_C_BINDING - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -310,15 +310,15 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) IMPLICIT NONE - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) INTERFACE SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT) BIND(C, name="execute_inv_fft_double") USE ISO_C_BINDING - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:,:) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:,:) + REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(:), OFFSETS(:) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index f9b2bf771..08165ec30 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -10,7 +10,7 @@ MODULE TRLTOG_MOD CONTAINS - SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& + SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *trltog * - transposition of grid point data from latitudinal @@ -30,7 +30,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ! Explicit arguments : ! -------------------- - ! PGLAT - Latitudinal data ready for direct FFT (input) + ! PREEL_REAL - Latitudinal data ready for direct FFT (input) ! PGP - Blocked grid point data (output) ! KVSET - "v-set" for each field (input) @@ -89,7 +89,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PGLAT(:,:) + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G @@ -350,7 +350,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) ! Present until self contribution and packing are done - !$ACC DATA PRESENT(PGLAT) ASYNC(1) + !$ACC DATA PRESENT(PREEL_REAL) ASYNC(1) CALL GSTATS(1806,1) ! Copy local contribution @@ -384,8 +384,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = IIN_TO_SEND_BUFR_V+JL - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,IIN_TO_SEND_BUFR(IPOS)) + IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD + PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) ENDDO ENDDO ELSE @@ -395,15 +395,15 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL) + IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN - PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN - PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PGLAT(JFLD,IPOS) + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN - PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN - PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PGLAT(JFLD,IPOS) + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ENDIF ENDDO ENDDO @@ -453,25 +453,25 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, ILEN = ISENDTOT(IPROC)/KF_FS IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COLLAPSE(2) ASYNC(1) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) COLLAPSE(2) ASYNC(1) DO JL=1,ILEN DO JFLD=1,KF_FS - II = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL) - ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PGLAT(JFLD,II) + IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD + ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) ENDDO ENDDO ENDDO CALL GSTATS(1605,1) !$ACC END DATA ! ZCOMBUFS - !$ACC END DATA ! PGLAT - !$ACC EXIT DATA DELETE(PGLAT) ASYNC(1) + !$ACC END DATA ! PREEL_REAL + !$ACC EXIT DATA DELETE(PREEL_REAL) ASYNC(1) IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) !$ACC WAIT(1) - DEALLOCATE(PGLAT) + DEALLOCATE(PREEL_REAL) CALL GSTATS(805,0) @@ -612,7 +612,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP, END SUBROUTINE TRLTOG_CUDAAWARE - SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& + SUBROUTINE TRLTOG(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *trltog * - transposition of grid point data from latitudinal @@ -630,7 +630,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! Explicit arguments : ! -------------------- - ! PGLAT - Latitudinal data ready for direct FFT (input) + ! PREEL_REAL - Latitudinal data ready for direct FFT (input) ! PGP - Blocked grid point data (output) ! KVSET - "v-set" for each field (input) @@ -693,7 +693,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PGLAT(:,:) + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G @@ -757,7 +757,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! -------------------- IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - !$ACC UPDATE HOST(PGLAT) + !$ACC UPDATE HOST(PREEL_REAL) CALL GSTATS(1806,0) @@ -1051,7 +1051,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFLD = KPTRGP(JFLD) DO JK=IFIRST,ILAST IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,INDEX(IPOS)) + PGP(JK,IFLD,JBLK) = PREEL_REAL((INDEX(IPOS)-1)*KF_FS+JFLD) ENDDO ENDDO ELSE @@ -1059,7 +1059,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFLD = IFLDOFF(JFLD) DO JK=IFIRST,ILAST IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP(JK,IFLD,JBLK) = PGLAT(JFLD,INDEX(IPOS)) + PGP(JK,IFLD,JBLK) = PREEL_REAL((INDEX(IPOS)-1)*KF_FS+JFLD) ENDDO ENDDO ENDIF @@ -1068,23 +1068,23 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IFLD = IFLDOFF(JFLD) IF(LLUV(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PGLAT(JFLD,INDEX(IPOS)) + IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PREEL_REAL(JFLD+KF_FS*IPOS) ENDDO ELSEIF(LLGP2(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP2(JK,IGP2PARS(IFLD),JBLK)=PGLAT(JFLD,INDEX(IPOS)) + IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PGP2(JK,IGP2PARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) ENDDO ELSEIF(LLGP3A(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PGLAT(JFLD,INDEX(IPOS)) + IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) ENDDO ELSEIF(LLGP3B(IFLD)) THEN DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PGLAT(JFLD,INDEX(IPOS)) + IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 + PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) ENDDO ELSE WRITE(NOUT,*)'TRLTOG_MOD: ERROR',JFLD,IFLD @@ -1126,7 +1126,7 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& II = INDEX(INDOFF(ISEND)+JL) DO JFLD=ISEND_FLD_START(ISEND),ISEND_FLD_END #endif - ZCOMBUFS((JFLD-ISEND_FLD_START(ISEND))*ILEN+JL,INS) = PGLAT(JFLD,II) + ZCOMBUFS((JFLD-ISEND_FLD_START(ISEND))*ILEN+JL,INS) = PREEL_REAL((II-1)*KF_FS+JFLD) ENDDO ENDDO ZCOMBUFS(-1,INS) = 1 @@ -1270,8 +1270,8 @@ SUBROUTINE TRLTOG(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - !$ACC EXIT DATA DELETE(PGLAT) - DEALLOCATE(PGLAT) + !$ACC EXIT DATA DELETE(PREEL_REAL) + DEALLOCATE(PREEL_REAL) IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) From cfb03c3ddb2d53a1d41c005e51a19bfec17bc324 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:18 -0700 Subject: [PATCH 144/263] linearize PREEL in FOURIER_IN for INV --- src/trans/gpu/internal/fourier_in_mod.F90 | 23 ++++++++++++----------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 60b31b43d..837a3b107 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -10,7 +10,7 @@ MODULE FOURIER_IN_MOD CONTAINS -SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) +SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) !**** *FOURIER_IN* - Copy fourier data from buffer to local array @@ -23,7 +23,8 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) ! CALL FOURIER_IN(...) ! Explicit arguments : PREEL_COMPLEX - local fourier/GP array -! -------------------- KFIELDS - number of fields +! -------------------- KF_CURRENT - number of fields that are read (from Legendre space) +! KF_TOTAL - total fields in PREEL ("stride") ! ! Externals. None. ! ---------- @@ -46,8 +47,8 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) IMPLICIT NONE REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_COMPLEX,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -67,25 +68,25 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELDS) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS + DO JF=1,KF_CURRENT IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = (D_NSTAGTF(KGL)/2)+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KF_CURRENT*2 - PREEL_COMPLEX(2*JF-1,JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF-1) - PREEL_COMPLEX(2*JF, JM+IOFF_COMPLEX) = FOUBUF(ISTA+2*JF ) + PREEL_COMPLEX(2*JF-1+2*KF_TOTAL*(JM+IOFF_COMPLEX)) = FOUBUF(ISTA+2*JF-1) + PREEL_COMPLEX(2*JF +2*KF_TOTAL*(JM+IOFF_COMPLEX)) = FOUBUF(ISTA+2*JF ) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 ! Truncation (not sure what is the exact upper bound here...) ! Same is also in FSC for the new fields. I *think* it should be N/2+1 elements in total ! TODO: Make sure this is correct - PREEL_COMPLEX(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT - PREEL_COMPLEX(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(2*JF-1+2*KF_TOTAL*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(2*JF +2*KF_TOTAL*(JM+IOFF_COMPLEX)) = 0._JPRBT ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 9c65c3301..388ca81f7 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -158,7 +158,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ENDIF ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now -CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX_2D,KF_INPUT/2) +CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) ! 2. Fourier space computations From 93158d13c7c9a5fd4bb9dba8e8f63fd432d72304 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:18 -0700 Subject: [PATCH 145/263] FSC is nomore pointer based (ready for transposition) --- src/trans/gpu/internal/fsc_mod.F90 | 47 ++++++++++++++---------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 22 +++++++---- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 3aae18f96..add4af50c 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -10,7 +10,8 @@ MODULE FSC_MOD CONTAINS -SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) +SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & + & KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) !**** *FSC - Division by a*cos(theta), east-west derivatives @@ -64,9 +65,9 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:,:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV, KF_SCALARS -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: & - & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) +INTEGER(KIND=JPIM), INTENT(IN) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET ! ------------------------------------------------------------------ @@ -80,7 +81,7 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA IINC=-1 ENDIF -!$ACC DATA PRESENT(D,G,F,D_NSTAGTF,G_NMEN,PUV,PSCALARS,PSCALARS_NSDER,PUV_EWDER,PSCALARS_EWDER,G_NLOEN) +!$ACC DATA PRESENT(D,G,F,D_NSTAGTF,G_NMEN,G_NLOEN,PREEL_COMPLEX) IF( LATLON.AND.S%LDLL.AND.S%LSHIFTLL ) THEN PRINT *, "This is not implemented yet! LATLON.AND.S%LDLL.AND.S%LSHIFTLL" @@ -104,15 +105,17 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV(2*JF-1,JM+IOFF_COMPLEX) = PUV(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 - PUV(2*JF, JM+IOFF_COMPLEX) = PUV(2*JF ,JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+2*JF, JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF ,JM+IOFF_COMPLEX)*ZACHTE2 ENDDO ENDDO ENDDO !* 1.2 N-S DERIVATIVES -IF (ASSOCIATED(PSCALARS_NSDER)) THEN +IF (KSCALARS_NSDER_OFFSET >= 0) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS @@ -123,8 +126,10 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_NSDER(2*JF-1,JM+IOFF_COMPLEX) = PSCALARS_NSDER(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 - PSCALARS_NSDER(2*JF, JM+IOFF_COMPLEX) = PSCALARS_NSDER(2*JF, JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF, JM+IOFF_COMPLEX)*ZACHTE2 ENDDO ENDDO ENDDO @@ -137,7 +142,7 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !* 2.1 U AND V. -IF (ASSOCIATED(PUV_EWDER)) THEN +IF (KUV_EWDER_OFFSET >= 0) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV @@ -148,20 +153,22 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PUV_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PUV(2*JF, JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) - PUV_EWDER(2*JF, JM+IOFF_COMPLEX) = PUV(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & + & -PREEL_COMPLEX(KUV_OFFSET+2*JF, JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PUV_EWDER(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT - PUV_EWDER(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = 0._JPRBT ENDDO ENDDO ENDDO ENDIF !* 2.2 SCALAR VARIABLES -IF (ASSOCIATED(PSCALARS_EWDER)) THEN +IF (KSCALARS_EWDER_OFFSET > 0) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS @@ -172,13 +179,15 @@ SUBROUTINE FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCA !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PSCALARS_EWDER(2*JF-1,JM+IOFF_COMPLEX) = -PSCALARS(2*JF ,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) - PSCALARS_EWDER(2*JF, JM+IOFF_COMPLEX) = PSCALARS(2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & + & -PREEL_COMPLEX(KSCALARS_OFFSET+2*JF ,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & + & PREEL_COMPLEX(KSCALARS_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PSCALARS_EWDER(2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT - PSCALARS_EWDER(2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = 0._JPRBT ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 388ca81f7..809b2ec1c 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -100,8 +100,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX_2D(:,:) -REAL(KIND=JPRBT), POINTER :: & - & PUV(:,:), PSCALARS(:,:), PSCALARS_NSDER(:,:), PUV_EWDER(:,:), PSCALARS_EWDER(:,:) +INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & + & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET INTEGER(KIND=JPIM) :: IST INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -135,25 +135,30 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ALLOCATE(PREEL_COMPLEX((2*KF_FS)*(D%NLENGTF/2))) !$ACC ENTER DATA CREATE(PREEL_COMPLEX) +! Initialize potentially unset offsets +KSCALARS_NSDER_OFFSET = -1 +KUV_EWDER_OFFSET = -1 +KSCALARS_EWDER_OFFSET = -1 + ! And reiterate domain decomposition to assign pointers CALL C_F_POINTER(C_LOC(PREEL_COMPLEX), PREEL_COMPLEX_2D, [2*KF_FS,D%NLENGTF/2]) IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -PUV => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) +KUV_OFFSET = 2*IFIRST IFIRST = IFIRST + 2*KF_UV ! U and V -PSCALARS => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) +KSCALARS_OFFSET = 2*IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) THEN - PSCALARS_NSDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + KSCALARS_NSDER_OFFSET = 2*IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ENDIF IF (LUVDER) THEN - PUV_EWDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+2*KF_UV),:) + KUV_EWDER_OFFSET = 2*IFIRST IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF IF (LSCDERS) THEN - PSCALARS_EWDER => PREEL_COMPLEX_2D(2*IFIRST+1:2*(IFIRST+KF_SCALARS),:) + KSCALARS_EWDER_OFFSET = 2*IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF @@ -163,7 +168,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ! 2. Fourier space computations ! fill the rest of PREEL_COMPLEX -CALL FSC(KF_UV, KF_SCALARS, PUV, PSCALARS, PSCALARS_NSDER, PUV_EWDER, PSCALARS_EWDER) +CALL FSC(PREEL_COMPLEX_2D, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & + & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) ! 3. Fourier transform ! inplace operation From 4f0f9b46265cf1a6f083497642107f6620a5c0df Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:18 -0700 Subject: [PATCH 146/263] linearize PREEL in FSC --- src/trans/gpu/internal/fsc_mod.F90 | 64 ++++++++++++------------ src/trans/gpu/internal/ftinv_ctl_mod.F90 | 6 +-- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index add4af50c..2d5f2dcb9 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -10,7 +10,7 @@ MODULE FSC_MOD CONTAINS -SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & +SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & & KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) !**** *FSC - Division by a*cos(theta), east-west derivatives @@ -25,11 +25,11 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & !** Interface. ! ---------- ! CALL FSC(..) -! Explicit arguments : PUV - u and v -! -------------------- PSCALAR - scalar valued varaibles -! PNSDERS - N-S derivative of S.V.V. -! PEWDERS - E-W derivative of S.V.V. -! PUVDERS - E-W derivative of u and v +! Explicit arguments : KF_FS - total stride +! -------------------- KF_UV - # uv layers +! KF_SCALARS - # scalar layers +! *_OFFSET - offset of the respective layer +! ! Method. ! ------- @@ -65,8 +65,8 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:,:) -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV, KF_SCALARS +REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV, KF_SCALARS INTEGER(KIND=JPIM), INTENT(IN) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET ! ------------------------------------------------------------------ @@ -99,16 +99,16 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 - PREEL_COMPLEX(KUV_OFFSET+2*JF, JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF ,JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 ENDDO ENDDO ENDDO @@ -120,16 +120,16 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2 - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF, JM+IOFF_COMPLEX)*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 ENDDO ENDDO ENDDO @@ -147,21 +147,21 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & - & -PREEL_COMPLEX(KUV_OFFSET+2*JF, JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & + & -PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT ENDDO ENDDO ENDDO @@ -173,21 +173,21 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2+1 + IOFF_COMPLEX = D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = & - & -PREEL_COMPLEX(KSCALARS_OFFSET+2*JF ,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = & - & PREEL_COMPLEX(KSCALARS_OFFSET+2*JF-1,JM+IOFF_COMPLEX)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & + & -PREEL_COMPLEX(KSCALARS_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & + & PREEL_COMPLEX(KSCALARS_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1,JM+IOFF_COMPLEX) = 0._JPRBT - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF, JM+IOFF_COMPLEX) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 809b2ec1c..d2bbae7e4 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -99,7 +99,6 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX_2D(:,:) INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET @@ -141,7 +140,6 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& KSCALARS_EWDER_OFFSET = -1 ! And reiterate domain decomposition to assign pointers -CALL C_F_POINTER(C_LOC(PREEL_COMPLEX), PREEL_COMPLEX_2D, [2*KF_FS,D%NLENGTF/2]) IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence @@ -168,8 +166,8 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& ! 2. Fourier space computations ! fill the rest of PREEL_COMPLEX -CALL FSC(PREEL_COMPLEX_2D, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & - & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) +CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) ! 3. Fourier transform ! inplace operation From 4994f311522183a17036ead2f5427bca99a04b89 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:18 -0700 Subject: [PATCH 147/263] slightly simplify offset computation in FSC --- src/trans/gpu/internal/fsc_mod.F90 | 48 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 2d5f2dcb9..b12e3d8b1 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -99,16 +99,16 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 - PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & + & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2 + PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & + & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2 ENDDO ENDDO ENDDO @@ -120,16 +120,16 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2 + PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & + & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2 ENDDO ENDDO ENDDO @@ -147,21 +147,21 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & - & -PREEL_COMPLEX(KUV_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KUV_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & + & -PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & + & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT - PREEL_COMPLEX(KUV_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = 0._JPRBT + PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = 0._JPRBT ENDDO ENDDO ENDDO @@ -173,21 +173,21 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = & - & -PREEL_COMPLEX(KSCALARS_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = & - & PREEL_COMPLEX(KSCALARS_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX))*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & + & -PREEL_COMPLEX(KSCALARS_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & + & PREEL_COMPLEX(KSCALARS_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF-1+2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+2*JF +2*KF_FS*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = 0._JPRBT + PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = 0._JPRBT ENDDO ENDDO ENDDO From 92e080f9cf84fbe1343582c5a9490cc59f659bf2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:19 -0700 Subject: [PATCH 148/263] Prepare for FFT transposition --- .../external/fourier/execute_plan_fftc.cu | 108 +++++++++++++++++- src/trans/gpu/internal/ftdir_mod.F90 | 99 +++++++++++++++- src/trans/gpu/internal/sump_trans_mod.F90 | 3 +- 3 files changed, 203 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index e5434de92..dcbb99738 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -208,10 +208,116 @@ void execute_fft(typename Type::real *data_real, typename Type::cmplx *data_comp /* } */ CUDA_CHECK(cudaDeviceSynchronize()); } +template +void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_complex, + int kfield, int *loens, int *offsets, int nfft) { + using real = typename Type::real; + using cmplx = typename Type::cmplx; + + /* static std::unordered_map allocationCache; // nloens -> ptr */ + static std::unordered_map> fftPlansCache; // kfield -> handles + static std::unordered_map graphCache; // kfield -> graphs + + // if the pointers are changed, we need to update the graph + static std::unordered_map> ptrCache; // kfield -> ptrs + + auto ptrs = ptrCache.find(kfield); + if (ptrs != ptrCache.end() && ( + ptrs->second.first != data_real || ptrs->second.second != data_complex)) { + // the plan is cached, but the pointers are not correct. we remove and delete the graph, + // but we keep the FFT plans, if this happens more often, we should cache this... + std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); + graphCache.erase(kfield); + ptrCache.erase(kfield); + } + + auto graph = graphCache.find(kfield); + if (graph == graphCache.end()) { + // this graph does not exist yet + + auto fftPlans = fftPlansCache.find(kfield); + if (fftPlans == fftPlansCache.end()) { + // the fft plans do not exist yet + std::vector newPlans; + newPlans.resize(nfft); + for (int i = 0; i < nfft; ++i) { + int nloen = loens[i]; + + cufftHandle plan; + CUFFT_CHECK(cufftCreate(&plan)); + int dist = 1; + int embed[] = {1}; + CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, + kfield, dist, Direction, kfield)); + newPlans[i] = plan; + } + fftPlansCache.insert({kfield, newPlans}); + } + fftPlans = fftPlansCache.find(kfield); + + // create a temporary stream + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + for (auto &plan : fftPlans->second) // set the streams + CUFFT_CHECK(cufftSetStream(plan, stream)); + + // now create the cuda graph + cudaGraph_t new_graph; + cudaGraphCreate(&new_graph, 0); + for (int i = 0; i < nfft; ++i) { + int offset = offsets[i]; + real *data_real_l = &data_real[kfield * offset]; + cmplx *data_complex_l = &data_complex[kfield * offset / 2]; + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + if constexpr(Direction == CUFFT_R2C) + CUFFT_CHECK(cufftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr(Direction == CUFFT_C2R) + CUFFT_CHECK(cufftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)) + else if constexpr(Direction == CUFFT_D2Z) + CUFFT_CHECK(cufftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr(Direction == CUFFT_Z2D) + CUFFT_CHECK(cufftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); + cudaGraph_t my_graph; + CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); + cudaGraphNode_t my_node; + CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); + } + cudaGraphExec_t instance; + CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaGraphDestroy(new_graph)); + + graphCache.insert({kfield, instance}); + ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); + } + + CUDA_CHECK(cudaGraphLaunch(graphCache.at(kfield), 0)); + /* for (int i = 0; i < nfft; ++i) { */ + /* int nloen = loens[i]; */ + + /* cufftHandle plan; */ + /* CUFFT_CHECK(cufftCreate(&plan)); */ + /* int dist = 1; */ + /* int embed[] = {1}; */ + /* CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, */ + /* kfield, dist, Direction, kfield)); */ + /* int offset = offsets[i]; */ + /* real *data_real_l = &data_real[kfield * offset]; */ + /* cmplx *data_complex_l = &data_complex[kfield * offset / 2]; */ + /* if (Direction == CUFFT_R2C) */ + /* CUFFT_CHECK(cufftExecR2C(plan, data_real_l, data_complex_l)) */ + /* else */ + /* CUFFT_CHECK(cufftExecC2R(plan, data_complex_l, data_real_l)); */ + /* CUFFT_CHECK(cufftDestroy(plan)); */ + /* } */ + CUDA_CHECK(cudaDeviceSynchronize()); +} extern "C" { void execute_dir_fft_float(float *data_real, cufftComplex *data_complex, int kfield, int *loens, int *offsets, int nfft) { - execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); + execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); } void execute_inv_fft_float(cufftComplex *data_complex, float *data_real, int kfield, int *loens, int *offsets, int nfft) { diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index a23859d4e..705f6e699 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -60,9 +60,11 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:), PREEL_REAL_TMP(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +INTEGER(KIND=JPIM) :: KF, KI ! ------------------------------------------------------------------ @@ -77,10 +79,12 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ENDIF +ALLOCATE(PREEL_REAL_TMP(KFIELD*D%NLENGTF)) ALLOCATE(PREEL_COMPLEX(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_COMPLEX) +ALLOCATE(PREEL_COMPLEX_TMP(KFIELD*D%NLENGTF)) +!$ACC ENTER DATA CREATE(PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) @@ -88,10 +92,93 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) +!$ACC KERNELS +PREEL_REAL_TMP(:) = -1 +!$ACC END KERNELS +FLUSH(300+MYPROC) +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & + & PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) + ENDDO + ENDDO +ENDDO +!$ACC KERNELS +PREEL_REAL(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) = & + & PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) + ENDDO + ENDDO +ENDDO CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & - & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) - + & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) + +! DO KGL=1,D%NDGL_FS + ! DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + ! KF=1 + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 + ! KF=2 + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 + ! KF=KFIELD + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 + ! WRITE(300+MYPROC,*) & + ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & + ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 + ! ENDDO +! ENDDO +!$ACC KERNELS +PREEL_COMPLEX_TMP(:) = -1 +!$ACC END KERNELS +FLUSH(300+MYPROC) +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) = & + & PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) + PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) = & + & PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) + ENDDO + ENDDO +ENDDO +!$ACC KERNELS +PREEL_COMPLEX(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) = & + & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) + PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) = & + & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) + ENDDO + ENDDO +ENDDO DO KGL=IBEG,IEND,IINC ! NSTAGTF gives us space for NLOEN+3 elements @@ -117,8 +204,10 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_REAL) +!$ACC EXIT DATA DELETE(PREEL_REAL,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) DEALLOCATE(PREEL_REAL) +DEALLOCATE(PREEL_COMPLEX_TMP) +DEALLOCATE(PREEL_REAL_TMP) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 04dda6722..8669aa95d 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -256,7 +256,7 @@ SUBROUTINE SUMP_TRANS D%NGPTOTL(:,:) = IGPTOTL(:,:) IF(.NOT.D%LGRIDONLY) THEN - ALLOCATE(D%NSTAGTF(D%NDGL_FS)) + ALLOCATE(D%NSTAGTF(D%NDGL_FS+1)) IF(LLP2)WRITE(NOUT,9) 'D%NSTAGTF ',SIZE(D%NSTAGTF ),SHAPE(D%NSTAGTF ) IOFF = 0 DO JGL=1,D%NDGL_FS @@ -269,6 +269,7 @@ SUBROUTINE SUMP_TRANS ! unaligned complex buffers IOFF = (IOFF+1)/2*2 ENDDO + D%NSTAGTF(D%NDGL_FS+1) = IOFF D%NLENGTF = IOFF ENDIF From f8938df7ed42fa849e9477d07f771c07d7c1d9dd Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:19 -0700 Subject: [PATCH 149/263] DIR FFT is transposed now (CHANGE1: 6) (CHANGE2: 8) --- .../external/fourier/execute_plan_fftc.cu | 8 +-- src/trans/gpu/internal/ftdir_mod.F90 | 61 +------------------ 2 files changed, 5 insertions(+), 64 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index dcbb99738..0b170be5c 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -246,10 +246,10 @@ void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_ cufftHandle plan; CUFFT_CHECK(cufftCreate(&plan)); - int dist = 1; + int dist = offsets[i+1] - offsets[i]; int embed[] = {1}; - CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, - kfield, dist, Direction, kfield)); + CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, 1, dist, embed, + 1, dist / 2, Direction, kfield)); newPlans[i] = plan; } fftPlansCache.insert({kfield, newPlans}); @@ -325,7 +325,7 @@ void execute_inv_fft_float(cufftComplex *data_complex, float *data_real, } void execute_dir_fft_double(double *data_real, cufftDoubleComplex *data_complex, int kfield, int *loens, int *offsets, int nfft) { - execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); + execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); } void execute_inv_fft_double(cufftDoubleComplex *data_complex, double *data_real, int kfield, int *loens, int *offsets, int nfft) { diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 705f6e699..204fe6af2 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -95,7 +95,6 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) !$ACC KERNELS PREEL_REAL_TMP(:) = -1 !$ACC END KERNELS -FLUSH(300+MYPROC) !$ACC PARALLEL LOOP COLLAPSE(2) DO KGL=1,D%NDGL_FS DO KF=1,KFIELD @@ -106,67 +105,9 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ENDDO ENDDO ENDDO -!$ACC KERNELS -PREEL_REAL(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) = & - & PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) - ENDDO - ENDDO -ENDDO -CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & +CALL EXECUTE_DIR_FFT(PREEL_REAL_TMP(:),PREEL_COMPLEX_TMP(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) - -! DO KGL=1,D%NDGL_FS - ! DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - ! KF=1 - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 - ! KF=2 - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 - ! KF=KFIELD - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1 - ! WRITE(300+MYPROC,*) & - ! & KI, KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2, & - ! & "<-", KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2 - ! ENDDO -! ENDDO -!$ACC KERNELS -PREEL_COMPLEX_TMP(:) = -1 -!$ACC END KERNELS -FLUSH(300+MYPROC) -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) = & - & PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) - PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) = & - & PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) - ENDDO - ENDDO -ENDDO -!$ACC KERNELS -PREEL_COMPLEX(:) = -1 -!$ACC END KERNELS !$ACC PARALLEL LOOP COLLAPSE(2) DO KGL=1,D%NDGL_FS DO KF=1,KFIELD From 1182224068ddd3d40b85aa36e49d82216b842022 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:19 -0700 Subject: [PATCH 150/263] fft dir: transpose complex part and remove intermediate --- src/trans/gpu/internal/fourier_out_mod.F90 | 7 ++-- src/trans/gpu/internal/ftdir_mod.F90 | 39 +++------------------- 2 files changed, 8 insertions(+), 38 deletions(-) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index bd70fb256..29b95029b 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -76,7 +76,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_COMPLEX = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) !$ACC LOOP SEQ @@ -84,9 +84,8 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) IPROC = D_NPROCM(JM) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 - ! This is not contiguous in PREEL due to the memory layout. - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(2*JF-1+KFIELDS*2*(JM+IOFF_COMPLEX)) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(2*JF +KFIELDS*2*(JM+IOFF_COMPLEX)) + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_COMPLEX+2*JM+1) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_COMPLEX+2*JM+2) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 204fe6af2..4dc165e63 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -60,7 +60,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:), PREEL_REAL_TMP(:) +REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_REAL_TMP(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -81,10 +81,9 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ALLOCATE(PREEL_REAL_TMP(KFIELD*D%NLENGTF)) ALLOCATE(PREEL_COMPLEX(KFIELD*D%NLENGTF)) -ALLOCATE(PREEL_COMPLEX_TMP(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC ENTER DATA CREATE(PREEL_COMPLEX,PREEL_REAL_TMP) -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_REAL_TMP) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) @@ -105,36 +104,9 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ENDDO ENDDO ENDDO -CALL EXECUTE_DIR_FFT(PREEL_REAL_TMP(:),PREEL_COMPLEX_TMP(:),KFIELD, & +CALL EXECUTE_DIR_FFT(PREEL_REAL_TMP(:),PREEL_COMPLEX(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) = & - & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) - PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) = & - & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) - ENDDO - ENDDO -ENDDO -DO KGL=IBEG,IEND,IINC - - ! NSTAGTF gives us space for NLOEN+3 elements - ! In reality, at this point we need space for at most NLOEN+2 elements - ! (in case NLOEN is even, otherwise NLOEN+1, due to the R2C definition) - IOFF_REAL=D%NSTAGTF(KGL)+1 - IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 - IGLG = D%NPTRLS(MYSETW)+KGL-1 - - ! CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G%NLOEN(IGLG),KFIELD,KFIELD) - ! !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) - ! CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,PREEL_REAL(1,IOFF_REAL),PREEL_COMPLEX(1,IOFF_COMPLEX)) - ! !$ACC END HOST_DATA -END DO -IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN CALL GSTATS(433,0) @@ -145,9 +117,8 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_REAL,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC EXIT DATA DELETE(PREEL_REAL,PREEL_REAL_TMP) DEALLOCATE(PREEL_REAL) -DEALLOCATE(PREEL_COMPLEX_TMP) DEALLOCATE(PREEL_REAL_TMP) ! ------------------------------------------------------------------ From d4b5a412d1b30a1516d359e1dfff2a4bd11ff3ea Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:19 -0700 Subject: [PATCH 151/263] FFT Dir trans: move the temporary real buffer to trgtol and remove double buffer --- src/trans/gpu/internal/ftdir_mod.F90 | 26 +++----------- src/trans/gpu/internal/trgtol_mod.F90 | 52 ++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 4dc165e63..f1f15404c 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -60,7 +60,6 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_REAL_TMP(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_R2C INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -79,11 +78,9 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) ENDIF -ALLOCATE(PREEL_REAL_TMP(KFIELD*D%NLENGTF)) -ALLOCATE(PREEL_COMPLEX(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_COMPLEX,PREEL_REAL_TMP) +PREEL_COMPLEX => PREEL_REAL -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_REAL_TMP) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) @@ -91,20 +88,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) -!$ACC KERNELS -PREEL_REAL_TMP(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & - & PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) - ENDDO - ENDDO -ENDDO -CALL EXECUTE_DIR_FFT(PREEL_REAL_TMP(:),PREEL_COMPLEX(:),KFIELD, & +CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) @@ -117,9 +101,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_REAL,PREEL_REAL_TMP) -DEALLOCATE(PREEL_REAL) -DEALLOCATE(PREEL_REAL_TMP) +NULLIFY(PREEL_REAL) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index f82d26230..0e8bb8ab0 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -96,6 +96,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) + REAL(KIND=JPRBT),ALLOCATABLE :: PREEL_REAL_TMP(:) ! LOCAL VARIABLES @@ -115,6 +116,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& &JGL, JI, JK, JL, ISETW, IFLD, & &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT + INTEGER(KIND=JPIM) :: KF, KGL, KI INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V @@ -141,6 +143,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& #define TRGTOL_DTYPE MPI_DOUBLE_PRECISION #endif +ALLOCATE(PREEL_REAL_TMP(KF_FS*D%NLENGTF)) ! ------------------------------------------------------------------ !* 0. Some initializations @@ -224,7 +227,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& IRECVTOT(JROC) = IPOS*KF_FS ENDDO - !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) CREATE(PREEL_REAL_TMP) ASYNC(1) CALL GSTATS(1805,1) @@ -281,7 +284,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) IPOS = IRECV_BUFR_TO_OUT_V+JL - PREEL_REAL(JFLD+KF_FS*(IRECV_BUFR_TO_OUT(IPOS)-1)) = PGP(JK,IFLD,JBLK) + PREEL_REAL_TMP(JFLD+KF_FS*(IRECV_BUFR_TO_OUT(IPOS)-1)) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -295,18 +298,18 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) - PREEL_REAL(JFLD+KF_FS*IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP2(JK,IOFF+1,JBLK) + PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -487,16 +490,29 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& DO JFLD=1,KF_FS DO JL=1,ILEN II = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL)-1 - PREEL_REAL(JFLD+KF_FS*II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) + PREEL_REAL_TMP(JFLD+KF_FS*II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) ENDDO ENDDO ENDDO + !$ACC WAIT(1) + !$ACC KERNELS + PREEL_REAL(:) = -1 + !$ACC END KERNELS + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO KGL=1,D%NDGL_FS + DO KF=1,KF_FS + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & + & PREEL_REAL_TMP(KF_FS*D%NSTAGTF(KGL)+(KI-1)*KF_FS+KF) + ENDDO + ENDDO + ENDDO CALL GSTATS(1603,1) !$ACC END DATA ! ZCOMBUFR !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES - !$ACC WAIT(1) IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) @@ -595,6 +611,7 @@ SUBROUTINE TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) + REAL(KIND=JPRBT),ALLOCATABLE :: PREEL_REAL_TMP(:) REAL(KIND=JPRBT) :: ZDUM(2) INTEGER(KIND=JPIM) :: ISENT (NPROC) @@ -636,6 +653,7 @@ SUBROUTINE TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR INTEGER(KIND=JPIM) :: IERROR, irank + INTEGER(KIND=JPIM) :: KF, KGL, KI REAL(KIND=JPRBT) :: TIMEF, tc @@ -1151,8 +1169,26 @@ SUBROUTINE TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) + ! this is not efficient but I don't care about this part of code !$ACC UPDATE DEVICE(PREEL_REAL) + !$ACC DATA PRESENT(PREEL_REAL) CREATE(PREEL_REAL_TMP) + !$ACC KERNELS + PREEL_REAL_TMP(:) = PREEL_REAL(:) + !$ACC END KERNELS + + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO KGL=1,D%NDGL_FS + DO KF=1,KF_FS + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & + & PREEL_REAL_TMP(KF_FS*D%NSTAGTF(KGL)+(KI-1)*KF_FS+KF) + ENDDO + ENDDO + ENDDO + !$ACC END DATA + IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL From 57febf9d35a79b6db2a14bdbb754bc692d0ef1ac Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 2 Jun 2022 05:28:34 -0700 Subject: [PATCH 152/263] Re-enable GPNORM --- src/trans/gpu/external/gpnorm_trans.F90 | 47 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 78d78e56a..fa1e2faa4 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -57,12 +57,12 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN ,ONLY : NOUT USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA, ZGTF, ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN +USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA, ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF,D_NPTRLS, MYPROC USE TPM_GEOMETRY ,ONLY : G,G_NLOEN,G_NLOEN_MAX USE TPM_FIELDS ,ONLY : F USE SET_RESOL_MOD ,ONLY : SET_RESOL -USE TRGTOL_MOD ,ONLY : TRGTOL +USE TRGTOL_MOD ,ONLY : TRGTOL_CUDAAWARE USE SET2PE_MOD ,ONLY : SET2PE USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS @@ -92,7 +92,8 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETS(:) INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETG(:,:) !GPU -!REAL(KIND=JPRBT),ALLOCATABLE :: ZGTF(:,:) +REAL(KIND=JPRBT) :: V +REAL(KIND=JPRBT),ALLOCATABLE,SAVE :: ZGTF(:) !REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) !REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) !REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) @@ -107,6 +108,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) INTEGER(KIND=JPIM) :: IPROC,ITAG,ILEN,ILENR,IBEG,IEND,IND !INTEGER(KIND=JPIM) :: iunit + ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',0,ZHOOK_HANDLE) @@ -145,6 +147,26 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IF_FS=IF_FS+1 ENDIF ENDDO +if (.not. allocated(zave)) then +ALLOCATE(ZAVE(IF_FS,R%NDGL)) +ALLOCATE(ZMINGL(IF_FS,R%NDGL)) +ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) +ALLOCATE(ZMINGPN(IF_FS)) +ALLOCATE(ZMAXGPN(IF_FS)) + +ZAVE = 0._JPRBT +ZMINGL = 0._JPRBT +ZMAXGL = 0._JPRBT +ZMINGPN = 0._JPRBT +ZMAXGPN = 0._JPRBT +!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + +if (.not. allocated(zgtf)) then +ALLOCATE(ZGTF(IF_FS*D%NLENGTF)) +write(nout,*)'ZGTF :',size(ZGTF) +!$ACC ENTER DATA CREATE(ZGTF) +endif +endif ALLOCATE(IVSETS(NPRTRV)) IVSETS(:)=0 @@ -167,10 +189,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ! done in setup_trans LGPNORM=.TRUE. -print *, "not supported" -flush(6) -stop 1 -! CALL TRGTOL(ZGTF,IF_FS,IF_GP,IF_SCALARS_G,IVSET,PGP=PGP) +CALL TRGTOL_CUDAAWARE(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP) LGPNORM=.FALSE. ! ZGTF is now on GPU @@ -188,23 +207,25 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC KERNELS DO JF=1,IF_FS - ZMINGL(JF,IBEG:IEND)=ZGTF(JF,D_NSTAGTF(1)+1) - ZMAXGL(JF,IBEG:IEND)=ZGTF(JF,D_NSTAGTF(1)+1) + V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) + ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) ENDDO !$ACC END KERNELS ! FIRST DO SUMS IN EACH FULL LATITUDE !$ACC KERNELS - DO JGL=IBEG,IEND + DO JGL=1,D%NDGL_FS IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS ZAVE(JF,JGL)=0.0_JPRB !$ACC loop DO JL=1,G_NLOEN(IGL) - ZAVE(JF,JGL)=ZAVE(JF,JGL)+ZGTF(JF,D_NSTAGTF(JGL)+JL) - ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),ZGTF(JF,D_NSTAGTF(JGL)+JL)) - ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),ZGTF(JF,D_NSTAGTF(JGL)+JL)) + V = ZGTF(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) + ZAVE(JF,JGL)=ZAVE(JF,JGL)+V + ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),V) + ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),V) ENDDO ENDDO ENDDO From 3de130e4727e75a99ec0fc3ec9d6d8e6e78a7f34 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:19 -0700 Subject: [PATCH 153/263] FFT Dir: Integrate transformed preel into trgtol --- src/trans/gpu/internal/trgtol_mod.F90 | 48 +++++++++++---------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 0e8bb8ab0..dbe0e0676 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -96,7 +96,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) - REAL(KIND=JPRBT),ALLOCATABLE :: PREEL_REAL_TMP(:) ! LOCAL VARIABLES @@ -119,7 +118,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: KF, KGL, KI INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP - INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V @@ -143,7 +142,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& #define TRGTOL_DTYPE MPI_DOUBLE_PRECISION #endif -ALLOCATE(PREEL_REAL_TMP(KF_FS*D%NLENGTF)) ! ------------------------------------------------------------------ !* 0. Some initializations @@ -219,15 +217,19 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& IGLL = JGL-D%NPTRLS(MYSETW)+1 DO JL=1,D%NONL(IGL,ISETB) IPOS = IPOS+1 - ! indicates where the data has to be stored - IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 + ! offset to first layer of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) ENDDO ENDDO !we always receive the full fourier space IRECVTOT(JROC) = IPOS*KF_FS ENDDO - !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) CREATE(PREEL_REAL_TMP) ASYNC(1) + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) CALL GSTATS(1805,1) @@ -283,8 +285,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT_V+JL - PREEL_REAL_TMP(JFLD+KF_FS*(IRECV_BUFR_TO_OUT(IPOS)-1)) = PGP(JK,IFLD,JBLK) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -294,22 +297,23 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL)-1 + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) PBOUND=UBOUND(PGPUV,2) - PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP2(JK,IOFF+1,JBLK) + PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) PBOUND=UBOUND(PGP3A,2) - PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) PBOUND=UBOUND(PGP3B,2) - PREEL_REAL_TMP(JFLD+KF_FS*IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -489,26 +493,14 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) DO JFLD=1,KF_FS DO JL=1,ILEN - II = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL)-1 - PREEL_REAL_TMP(JFLD+KF_FS*II) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) ENDDO ENDDO ENDDO !$ACC WAIT(1) - !$ACC KERNELS - PREEL_REAL(:) = -1 - !$ACC END KERNELS - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO KGL=1,D%NDGL_FS - DO KF=1,KF_FS - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & - & PREEL_REAL_TMP(KF_FS*D%NSTAGTF(KGL)+(KI-1)*KF_FS+KF) - ENDDO - ENDDO - ENDDO CALL GSTATS(1603,1) !$ACC END DATA ! ZCOMBUFR From 34c3106ab3178bd3bb91ebab9431a1316dda0bd9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:20 -0700 Subject: [PATCH 154/263] INV: Prepare for FFT transposition --- src/trans/gpu/internal/ftdir_mod.F90 | 16 +------ src/trans/gpu/internal/ftinv_mod.F90 | 68 ++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index f1f15404c..908fed977 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -60,24 +60,10 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET -INTEGER(KIND=JPIM) :: IPLAN_R2C -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -INTEGER(KIND=JPIM) :: KF, KI +INTEGER(KIND=JPIM) :: IGLG,KGL ! ------------------------------------------------------------------ -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - - PREEL_COMPLEX => PREEL_REAL !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index bef3162f5..b22ac3df7 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -60,9 +60,11 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:), PREEL_REAL_TMP(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_C2R INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +INTEGER(KIND=JPIM) :: KF, KI ! ------------------------------------------------------------------ @@ -78,9 +80,11 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ALLOCATE(PREEL_REAL(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_REAL) +ALLOCATE(PREEL_REAL_TMP(KFIELD*D%NLENGTF)) +ALLOCATE(PREEL_COMPLEX_TMP(KFIELD*D%NLENGTF)) +!$ACC ENTER DATA CREATE(PREEL_REAL,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) -!$ACC DATA PRESENT(PREEL_COMPLEX,PREEL_REAL) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) @@ -88,9 +92,65 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) +!$ACC KERNELS +PREEL_COMPLEX_TMP(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) & + & = PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) + PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) & + & = PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) + ENDDO + ENDDO +ENDDO +!$ACC KERNELS +PREEL_COMPLEX(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) = & + & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) + PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) = & + & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) + ENDDO + ENDDO +ENDDO CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) +!$ACC KERNELS +PREEL_REAL_TMP(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & + & PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) + ENDDO + ENDDO +ENDDO +!$ACC KERNELS +PREEL_REAL(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KFIELD + !$ACC LOOP SEQ + DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) + PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) = & + & PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) + ENDDO + ENDDO +ENDDO ! DO KGL=IBEG,IEND,IINC @@ -114,8 +174,10 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_COMPLEX) +!$ACC EXIT DATA DELETE(PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) DEALLOCATE(PREEL_COMPLEX) +DEALLOCATE(PREEL_COMPLEX_TMP) +DEALLOCATE(PREEL_REAL_TMP) ! ------------------------------------------------------------------ From 56efaf563774202addc988f4a2828bafc4935a58 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:20 -0700 Subject: [PATCH 155/263] INV FFT is transposed now (CHANGE1: 7) (CHANGE2: 9) --- .../external/fourier/execute_plan_fftc.cu | 9 ++--- src/trans/gpu/internal/ftinv_mod.F90 | 33 +------------------ 2 files changed, 6 insertions(+), 36 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index 0b170be5c..b04b36c9a 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -211,6 +211,7 @@ void execute_fft(typename Type::real *data_real, typename Type::cmplx *data_comp template void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_complex, int kfield, int *loens, int *offsets, int nfft) { + constexpr bool is_forward = Direction == CUFFT_R2C || Direction == CUFFT_D2Z; using real = typename Type::real; using cmplx = typename Type::cmplx; @@ -248,8 +249,8 @@ void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_ CUFFT_CHECK(cufftCreate(&plan)); int dist = offsets[i+1] - offsets[i]; int embed[] = {1}; - CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, 1, dist, embed, - 1, dist / 2, Direction, kfield)); + CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, 1, is_forward ? dist : dist / 2, embed, + 1, is_forward ? dist / 2 : dist, Direction, kfield)); newPlans[i] = plan; } fftPlansCache.insert({kfield, newPlans}); @@ -321,7 +322,7 @@ void execute_dir_fft_float(float *data_real, cufftComplex *data_complex, } void execute_inv_fft_float(cufftComplex *data_complex, float *data_real, int kfield, int *loens, int *offsets, int nfft) { - execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); + execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); } void execute_dir_fft_double(double *data_real, cufftDoubleComplex *data_complex, int kfield, int *loens, int *offsets, int nfft) { @@ -329,6 +330,6 @@ void execute_dir_fft_double(double *data_real, cufftDoubleComplex *data_complex, } void execute_inv_fft_double(cufftDoubleComplex *data_complex, double *data_real, int kfield, int *loens, int *offsets, int nfft) { - execute_fft(data_real, data_complex, kfield, loens, offsets, nfft); + execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); } } diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index b22ac3df7..a9f5ffa4f 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -107,40 +107,9 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ENDDO ENDDO ENDDO -!$ACC KERNELS -PREEL_COMPLEX(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) = & - & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) - PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) = & - & PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) - ENDDO - ENDDO -ENDDO -CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & +CALL EXECUTE_INV_FFT(PREEL_COMPLEX_TMP(:),PREEL_REAL_TMP(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) -!$ACC KERNELS -PREEL_REAL_TMP(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & - & PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) - ENDDO - ENDDO -ENDDO -!$ACC KERNELS -PREEL_REAL(:) = -1 -!$ACC END KERNELS !$ACC PARALLEL LOOP COLLAPSE(2) DO KGL=1,D%NDGL_FS DO KF=1,KFIELD From 9deea4864e7ecb44624bf93e510721c92b0790cb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:20 -0700 Subject: [PATCH 156/263] Redundant/temporary duplication in the fft wrappers not needed anymore --- .../external/fourier/execute_plan_fftc.cu | 309 ++++++------------ 1 file changed, 100 insertions(+), 209 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu index b04b36c9a..ff2a6ebe1 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -49,15 +49,15 @@ static const char *_cudaGetErrorEnum(cufftResult error) { } \ } -#define CUFFT_CHECK(e) { \ - cufftResult_t err = (e); \ - if (err != CUFFT_SUCCESS) \ - { \ - fprintf(stderr, "CUFFT error: %s, line %d, %s: %s\n", \ - __FILE__, __LINE__, #e, _cudaGetErrorEnum(err)); \ - exit(EXIT_FAILURE); \ - } \ -} +#define CUFFT_CHECK(e) \ + { \ + cufftResult_t err = (e); \ + if (err != CUFFT_SUCCESS) { \ + fprintf(stderr, "CUFFT error: %s, line %d, %s: %s\n", __FILE__, \ + __LINE__, #e, _cudaGetErrorEnum(err)); \ + exit(EXIT_FAILURE); \ + } \ + } extern void *planWorkspace; @@ -101,235 +101,126 @@ struct Float { using real = float; using cmplx = cufftComplex; }; -} +} // namespace template -void execute_fft(typename Type::real *data_real, typename Type::cmplx *data_complex, - int kfield, int *loens, int *offsets, int nfft) { - using real = typename Type::real; - using cmplx = typename Type::cmplx; - - /* static std::unordered_map allocationCache; // nloens -> ptr */ - static std::unordered_map> fftPlansCache; // kfield -> handles - static std::unordered_map graphCache; // kfield -> graphs - - // if the pointers are changed, we need to update the graph - static std::unordered_map> ptrCache; // kfield -> ptrs - - auto ptrs = ptrCache.find(kfield); - if (ptrs != ptrCache.end() && ( - ptrs->second.first != data_real || ptrs->second.second != data_complex)) { - // the plan is cached, but the pointers are not correct. we remove and delete the graph, - // but we keep the FFT plans, if this happens more often, we should cache this... - std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; - CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); - graphCache.erase(kfield); - ptrCache.erase(kfield); - } - - auto graph = graphCache.find(kfield); - if (graph == graphCache.end()) { - // this graph does not exist yet - - auto fftPlans = fftPlansCache.find(kfield); - if (fftPlans == fftPlansCache.end()) { - // the fft plans do not exist yet - std::vector newPlans; - newPlans.resize(nfft); - for (int i = 0; i < nfft; ++i) { - int nloen = loens[i]; - - cufftHandle plan; - CUFFT_CHECK(cufftCreate(&plan)); - int dist = 1; - int embed[] = {1}; - CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, - kfield, dist, Direction, kfield)); - newPlans[i] = plan; - } - fftPlansCache.insert({kfield, newPlans}); - } - fftPlans = fftPlansCache.find(kfield); - - // create a temporary stream - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - for (auto &plan : fftPlans->second) // set the streams - CUFFT_CHECK(cufftSetStream(plan, stream)); - - // now create the cuda graph - cudaGraph_t new_graph; - cudaGraphCreate(&new_graph, 0); - for (int i = 0; i < nfft; ++i) { - int offset = offsets[i]; - real *data_real_l = &data_real[kfield * offset]; - cmplx *data_complex_l = &data_complex[kfield * offset / 2]; - CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - if constexpr(Direction == CUFFT_R2C) - CUFFT_CHECK(cufftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)) - else if constexpr(Direction == CUFFT_C2R) - CUFFT_CHECK(cufftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)) - else if constexpr(Direction == CUFFT_D2Z) - CUFFT_CHECK(cufftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)) - else if constexpr(Direction == CUFFT_Z2D) - CUFFT_CHECK(cufftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); - cudaGraph_t my_graph; - CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); - cudaGraphNode_t my_node; - CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); - } - cudaGraphExec_t instance; - CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaGraphDestroy(new_graph)); - - graphCache.insert({kfield, instance}); - ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); - } - - CUDA_CHECK(cudaGraphLaunch(graphCache.at(kfield), 0)); - /* for (int i = 0; i < nfft; ++i) { */ - /* int nloen = loens[i]; */ - - /* cufftHandle plan; */ - /* CUFFT_CHECK(cufftCreate(&plan)); */ - /* int dist = 1; */ - /* int embed[] = {1}; */ - /* CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, */ - /* kfield, dist, Direction, kfield)); */ - /* int offset = offsets[i]; */ - /* real *data_real_l = &data_real[kfield * offset]; */ - /* cmplx *data_complex_l = &data_complex[kfield * offset / 2]; */ - /* if (Direction == CUFFT_R2C) */ - /* CUFFT_CHECK(cufftExecR2C(plan, data_real_l, data_complex_l)) */ - /* else */ - /* CUFFT_CHECK(cufftExecC2R(plan, data_complex_l, data_real_l)); */ - /* CUFFT_CHECK(cufftDestroy(plan)); */ - /* } */ - CUDA_CHECK(cudaDeviceSynchronize()); -} -template -void execute_fft_new(typename Type::real *data_real, typename Type::cmplx *data_complex, - int kfield, int *loens, int *offsets, int nfft) { +void execute_fft(typename Type::real *data_real, + typename Type::cmplx *data_complex, int kfield, int *loens, + int *offsets, int nfft) { constexpr bool is_forward = Direction == CUFFT_R2C || Direction == CUFFT_D2Z; using real = typename Type::real; using cmplx = typename Type::cmplx; /* static std::unordered_map allocationCache; // nloens -> ptr */ - static std::unordered_map> fftPlansCache; // kfield -> handles - static std::unordered_map graphCache; // kfield -> graphs + static std::unordered_map> + fftPlansCache; // kfield -> handles + static std::unordered_map + graphCache; // kfield -> graphs // if the pointers are changed, we need to update the graph - static std::unordered_map> ptrCache; // kfield -> ptrs + static std::unordered_map> + ptrCache; // kfield -> ptrs auto ptrs = ptrCache.find(kfield); - if (ptrs != ptrCache.end() && ( - ptrs->second.first != data_real || ptrs->second.second != data_complex)) { - // the plan is cached, but the pointers are not correct. we remove and delete the graph, - // but we keep the FFT plans, if this happens more often, we should cache this... - std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; - CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); - graphCache.erase(kfield); - ptrCache.erase(kfield); + if (ptrs != ptrCache.end() && (ptrs->second.first != data_real || + ptrs->second.second != data_complex)) { + // the plan is cached, but the pointers are not correct. we remove and + // delete the graph, but we keep the FFT plans, if this happens more often, + // we should cache this... + std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); + graphCache.erase(kfield); + ptrCache.erase(kfield); } auto graph = graphCache.find(kfield); if (graph == graphCache.end()) { - // this graph does not exist yet - - auto fftPlans = fftPlansCache.find(kfield); - if (fftPlans == fftPlansCache.end()) { - // the fft plans do not exist yet - std::vector newPlans; - newPlans.resize(nfft); - for (int i = 0; i < nfft; ++i) { - int nloen = loens[i]; - - cufftHandle plan; - CUFFT_CHECK(cufftCreate(&plan)); - int dist = offsets[i+1] - offsets[i]; - int embed[] = {1}; - CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, 1, is_forward ? dist : dist / 2, embed, - 1, is_forward ? dist / 2 : dist, Direction, kfield)); - newPlans[i] = plan; - } - fftPlansCache.insert({kfield, newPlans}); - } - fftPlans = fftPlansCache.find(kfield); - - // create a temporary stream - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - for (auto &plan : fftPlans->second) // set the streams - CUFFT_CHECK(cufftSetStream(plan, stream)); + // this graph does not exist yet - // now create the cuda graph - cudaGraph_t new_graph; - cudaGraphCreate(&new_graph, 0); + auto fftPlans = fftPlansCache.find(kfield); + if (fftPlans == fftPlansCache.end()) { + // the fft plans do not exist yet + std::vector newPlans; + newPlans.resize(nfft); for (int i = 0; i < nfft; ++i) { - int offset = offsets[i]; - real *data_real_l = &data_real[kfield * offset]; - cmplx *data_complex_l = &data_complex[kfield * offset / 2]; - CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - if constexpr(Direction == CUFFT_R2C) - CUFFT_CHECK(cufftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)) - else if constexpr(Direction == CUFFT_C2R) - CUFFT_CHECK(cufftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)) - else if constexpr(Direction == CUFFT_D2Z) - CUFFT_CHECK(cufftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)) - else if constexpr(Direction == CUFFT_Z2D) - CUFFT_CHECK(cufftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); - cudaGraph_t my_graph; - CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); - cudaGraphNode_t my_node; - CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); + int nloen = loens[i]; + + cufftHandle plan; + CUFFT_CHECK(cufftCreate(&plan)); + int dist = offsets[i + 1] - offsets[i]; + int embed[] = {1}; + CUFFT_CHECK(cufftPlanMany( + &plan, 1, &nloen, embed, 1, is_forward ? dist : dist / 2, embed, 1, + is_forward ? dist / 2 : dist, Direction, kfield)); + newPlans[i] = plan; } - cudaGraphExec_t instance; - CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaGraphDestroy(new_graph)); - - graphCache.insert({kfield, instance}); - ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); + fftPlansCache.insert({kfield, newPlans}); + } + fftPlans = fftPlansCache.find(kfield); + + // create a temporary stream + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + for (auto &plan : fftPlans->second) // set the streams + CUFFT_CHECK(cufftSetStream(plan, stream)); + + // now create the cuda graph + cudaGraph_t new_graph; + cudaGraphCreate(&new_graph, 0); + for (int i = 0; i < nfft; ++i) { + int offset = offsets[i]; + real *data_real_l = &data_real[kfield * offset]; + cmplx *data_complex_l = &data_complex[kfield * offset / 2]; + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + if constexpr (Direction == CUFFT_R2C) + CUFFT_CHECK( + cufftExecR2C(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr (Direction == CUFFT_C2R) + CUFFT_CHECK( + cufftExecC2R(fftPlans->second[i], data_complex_l, data_real_l)) + else if constexpr (Direction == CUFFT_D2Z) + CUFFT_CHECK( + cufftExecD2Z(fftPlans->second[i], data_real_l, data_complex_l)) + else if constexpr (Direction == CUFFT_Z2D) + CUFFT_CHECK( + cufftExecZ2D(fftPlans->second[i], data_complex_l, data_real_l)); + cudaGraph_t my_graph; + CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); + cudaGraphNode_t my_node; + CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, + my_graph)); + } + cudaGraphExec_t instance; + CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaGraphDestroy(new_graph)); + + graphCache.insert({kfield, instance}); + ptrCache.insert({kfield, std::make_pair(data_real, data_complex)}); } CUDA_CHECK(cudaGraphLaunch(graphCache.at(kfield), 0)); - /* for (int i = 0; i < nfft; ++i) { */ - /* int nloen = loens[i]; */ - - /* cufftHandle plan; */ - /* CUFFT_CHECK(cufftCreate(&plan)); */ - /* int dist = 1; */ - /* int embed[] = {1}; */ - /* CUFFT_CHECK(cufftPlanMany(&plan, 1, &nloen, embed, kfield, dist, embed, */ - /* kfield, dist, Direction, kfield)); */ - /* int offset = offsets[i]; */ - /* real *data_real_l = &data_real[kfield * offset]; */ - /* cmplx *data_complex_l = &data_complex[kfield * offset / 2]; */ - /* if (Direction == CUFFT_R2C) */ - /* CUFFT_CHECK(cufftExecR2C(plan, data_real_l, data_complex_l)) */ - /* else */ - /* CUFFT_CHECK(cufftExecC2R(plan, data_complex_l, data_real_l)); */ - /* CUFFT_CHECK(cufftDestroy(plan)); */ - /* } */ CUDA_CHECK(cudaDeviceSynchronize()); } extern "C" { void execute_dir_fft_float(float *data_real, cufftComplex *data_complex, - int kfield, int *loens, int *offsets, int nfft) { - execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, + nfft); } void execute_inv_fft_float(cufftComplex *data_complex, float *data_real, - int kfield, int *loens, int *offsets, int nfft) { - execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, offsets, + nfft); } void execute_dir_fft_double(double *data_real, cufftDoubleComplex *data_complex, - int kfield, int *loens, int *offsets, int nfft) { - execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, + offsets, nfft); } void execute_inv_fft_double(cufftDoubleComplex *data_complex, double *data_real, - int kfield, int *loens, int *offsets, int nfft) { - execute_fft_new(data_real, data_complex, kfield, loens, offsets, nfft); + int kfield, int *loens, int *offsets, int nfft) { + execute_fft(data_real, data_complex, kfield, loens, + offsets, nfft); } } From 5eb4bc52c32b13a631c3b4c633301740464c86dc Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:20 -0700 Subject: [PATCH 157/263] FFT INV: Remove double buffer for preel_real and transpose in trltog --- src/trans/gpu/internal/ftinv_mod.F90 | 34 ++++----------------------- src/trans/gpu/internal/trltog_mod.F90 | 19 ++++++++++----- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index a9f5ffa4f..eb813e4b8 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -60,7 +60,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:), PREEL_REAL_TMP(:) +REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:) INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET INTEGER(KIND=JPIM) :: IPLAN_C2R INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -80,11 +80,10 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ALLOCATE(PREEL_REAL(KFIELD*D%NLENGTF)) -ALLOCATE(PREEL_REAL_TMP(KFIELD*D%NLENGTF)) ALLOCATE(PREEL_COMPLEX_TMP(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_REAL,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC ENTER DATA CREATE(PREEL_REAL,PREEL_COMPLEX_TMP) -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) @@ -107,31 +106,9 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ENDDO ENDDO ENDDO -CALL EXECUTE_INV_FFT(PREEL_COMPLEX_TMP(:),PREEL_REAL_TMP(:),KFIELD, & +CALL EXECUTE_INV_FFT(PREEL_COMPLEX_TMP(:),PREEL_REAL(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL(KFIELD*D%NSTAGTF(KGL)+(KI-1)*KFIELD+KF) = & - & PREEL_REAL_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) - ENDDO - ENDDO -ENDDO - -! DO KGL=IBEG,IEND,IINC - - ! IOFF_REAL=D%NSTAGTF(KGL)+1 - ! IOFF_COMPLEX=D%NSTAGTF(KGL)/2+1 - ! IGLG = D%NPTRLS(MYSETW)+KGL-1 - - ! CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G%NLOEN(IGLG),KFIELD,KFIELD) - ! !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) - ! CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL_COMPLEX(1,IOFF_COMPLEX),PREEL_REAL(1,IOFF_REAL)) - ! !$ACC END HOST_DATA -! END DO IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN @@ -143,10 +120,9 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_COMPLEX,PREEL_COMPLEX_TMP,PREEL_REAL_TMP) +!$ACC EXIT DATA DELETE(PREEL_COMPLEX,PREEL_COMPLEX_TMP) DEALLOCATE(PREEL_COMPLEX) DEALLOCATE(PREEL_COMPLEX_TMP) -DEALLOCATE(PREEL_REAL_TMP) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 08165ec30..91ab2d770 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -122,7 +122,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF INTEGER(KIND=JPIM) :: IFLDA(KF_GP) - INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V @@ -333,8 +333,12 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP IGLL = JGL-D%NPTRLS(MYSETW)+1 DO JL=1,D%NONL(IGL,ISETB) IPOS = IPOS+1 - ! indicates where the data has to be stored - IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS) = D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+JL-1 + ! offset to first layer of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) ENDDO ENDDO !we always receive the full fourier space @@ -384,7 +388,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) ENDDO ENDDO @@ -395,7 +400,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) - IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN @@ -456,7 +462,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) COLLAPSE(2) ASYNC(1) DO JL=1,ILEN DO JFLD=1,KF_FS - IPOS = KF_FS*(IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL)-1)+JFLD + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) ENDDO ENDDO From 41c098b084fa41ef4230ab0e76072730dbc7bf29 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:21 -0700 Subject: [PATCH 158/263] INV: Move transposition into ftinv_ctl_mod --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 26 +++++++++++++++ src/trans/gpu/internal/ftinv_mod.F90 | 41 +++--------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index d2bbae7e4..941854b3c 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -109,6 +109,9 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR INTEGER(KIND=JPIM) :: IFIRST INTEGER(KIND=JPIM) :: KF_FS +INTEGER(JPIM) :: KGL, KF, KI + +REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:) ! ------------------------------------------------------------------ @@ -169,6 +172,29 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) +ALLOCATE(PREEL_COMPLEX_TMP(KF_FS*D%NLENGTF)) +!$ACC DATA PRESENT(PREEL_COMPLEX) CREATE(PREEL_COMPLEX_TMP) +!$ACC KERNELS +PREEL_COMPLEX_TMP(:) = -1 +!$ACC END KERNELS +!$ACC PARALLEL LOOP COLLAPSE(2) +DO KGL=1,D%NDGL_FS + DO KF=1,KF_FS + !$ACC LOOP SEQ + DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 + PREEL_COMPLEX_TMP(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) & + & = PREEL_COMPLEX(KF_FS*D%NSTAGTF(KGL)+2*(KI-1)*KF_FS+2*(KF-1)+1) + PREEL_COMPLEX_TMP(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) & + & = PREEL_COMPLEX(KF_FS*D%NSTAGTF(KGL)+2*(KI-1)*KF_FS+2*(KF-1)+2) + ENDDO + ENDDO +ENDDO +!$ACC KERNELS +PREEL_COMPLEX(:) = PREEL_COMPLEX_TMP(:) +!$ACC END KERNELS +!$ACC END DATA +DEALLOCATE(PREEL_COMPLEX_TMP) + ! 3. Fourier transform ! inplace operation IF(KF_FS > 0) THEN diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index eb813e4b8..c0dab53ab 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -60,30 +60,15 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:) -INTEGER(KIND=JPIM) :: IGLG,IOFF_REAL,IOFF_COMPLEX,KGL,IRET -INTEGER(KIND=JPIM) :: IPLAN_C2R -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC -INTEGER(KIND=JPIM) :: KF, KI +INTEGER(KIND=JPIM) :: IGLG,KGL,IRET ! ------------------------------------------------------------------ -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - ALLOCATE(PREEL_REAL(KFIELD*D%NLENGTF)) -ALLOCATE(PREEL_COMPLEX_TMP(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_REAL,PREEL_COMPLEX_TMP) +!$ACC ENTER DATA CREATE(PREEL_REAL) -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,PREEL_COMPLEX_TMP) +!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) @@ -91,22 +76,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) -!$ACC KERNELS -PREEL_COMPLEX_TMP(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KFIELD - !$ACC LOOP SEQ - DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) & - & = PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+1) - PREEL_COMPLEX_TMP(KFIELD*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) & - & = PREEL_COMPLEX(KFIELD*D%NSTAGTF(KGL)+2*(KI-1)*KFIELD+2*(KF-1)+2) - ENDDO - ENDDO -ENDDO -CALL EXECUTE_INV_FFT(PREEL_COMPLEX_TMP(:),PREEL_REAL(:),KFIELD, & +CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) IRET = CUDA_SYNCHRONIZE() @@ -120,9 +90,8 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_COMPLEX,PREEL_COMPLEX_TMP) +!$ACC EXIT DATA DELETE(PREEL_COMPLEX) DEALLOCATE(PREEL_COMPLEX) -DEALLOCATE(PREEL_COMPLEX_TMP) ! ------------------------------------------------------------------ From b529aa3d0c43b190c8455878afd73598f78ea464 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:21 -0700 Subject: [PATCH 159/263] INV: In-place FFT --- src/trans/gpu/internal/ftinv_mod.F90 | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index c0dab53ab..435989cd0 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -64,9 +64,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) ! ------------------------------------------------------------------ - -ALLOCATE(PREEL_REAL(KFIELD*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_REAL) +PREEL_REAL => PREEL_COMPLEX !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) @@ -90,8 +88,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) !$ACC END DATA -!$ACC EXIT DATA DELETE(PREEL_COMPLEX) -DEALLOCATE(PREEL_COMPLEX) +NULLIFY(PREEL_COMPLEX) ! ------------------------------------------------------------------ From 337d4530b74e779ba3aa3fec739b4cd4059ce67c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:21 -0700 Subject: [PATCH 160/263] INV: Adapt FSC to the tranposed layout --- src/trans/gpu/internal/fsc_mod.F90 | 61 ++++++++++++++---------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 23 ++++----- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index b12e3d8b1..4f70ea548 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -61,7 +61,8 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT) :: ZACHTE2 -INTEGER(KIND=JPIM) :: IOFF_COMPLEX,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC @@ -99,16 +100,17 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & - & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2 - PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & - & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2 + PREEL_COMPLEX(IOFF_UV+2*JM+1) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_UV+2*JM+2) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2 ENDDO ENDDO ENDDO @@ -120,16 +122,17 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2 - PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & - & PREEL_COMPLEX(KSCALARS_NSDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2 + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2)*ZACHTE2 ENDDO ENDDO ENDDO @@ -147,21 +150,23 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & - & -PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & - & PREEL_COMPLEX(KUV_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = & + & -PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = 0._JPRBT - PREEL_COMPLEX(KUV_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = 0._JPRBT + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = 0._JPRBT + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = 0._JPRBT ENDDO ENDDO ENDDO @@ -169,25 +174,29 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !* 2.2 SCALAR VARIABLES IF (KSCALARS_EWDER_OFFSET > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) & + !$ACC& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,JM) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = 2*(JF-1)+2*KF_FS*D_NSTAGTF(KGL)/2 + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + ZACHTE2 = F%RACTHE(IGLG) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = & - & -PREEL_COMPLEX(KSCALARS_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = & - & PREEL_COMPLEX(KSCALARS_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = & + & -PREEL_COMPLEX(IOFF_SCALARS+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = & + & PREEL_COMPLEX(IOFF_SCALARS+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+1) = 0._JPRBT - PREEL_COMPLEX(KSCALARS_EWDER_OFFSET+IOFF_COMPLEX+2*KF_FS*JM+2) = 0._JPRBT + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = 0._JPRBT + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = 0._JPRBT ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 941854b3c..d5bf75147 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -134,7 +134,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST -ALLOCATE(PREEL_COMPLEX((2*KF_FS)*(D%NLENGTF/2))) +ALLOCATE(PREEL_COMPLEX(KF_FS*D%NLENGTF)) !$ACC ENTER DATA CREATE(PREEL_COMPLEX) ! Initialize potentially unset offsets @@ -146,31 +146,26 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -KUV_OFFSET = 2*IFIRST +KUV_OFFSET = IFIRST IFIRST = IFIRST + 2*KF_UV ! U and V -KSCALARS_OFFSET = 2*IFIRST +KSCALARS_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) THEN - KSCALARS_NSDER_OFFSET = 2*IFIRST + KSCALARS_NSDER_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ENDIF IF (LUVDER) THEN - KUV_EWDER_OFFSET = 2*IFIRST + KUV_EWDER_OFFSET = IFIRST IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF IF (LSCDERS) THEN - KSCALARS_EWDER_OFFSET = 2*IFIRST + KSCALARS_EWDER_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) -! 2. Fourier space computations - -! fill the rest of PREEL_COMPLEX -CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & - & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) ALLOCATE(PREEL_COMPLEX_TMP(KF_FS*D%NLENGTF)) !$ACC DATA PRESENT(PREEL_COMPLEX) CREATE(PREEL_COMPLEX_TMP) @@ -195,6 +190,12 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& !$ACC END DATA DEALLOCATE(PREEL_COMPLEX_TMP) +! 2. Fourier space computations + +! fill the rest of PREEL_COMPLEX +CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + ! 3. Fourier transform ! inplace operation IF(KF_FS > 0) THEN From 8280e2c4a862cb10e0e12132d6f07d1e880588bb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:21 -0700 Subject: [PATCH 161/263] INV: Fourier_in is transposed, too --- src/trans/gpu/internal/fourier_in_mod.F90 | 14 +++++++------- src/trans/gpu/internal/fourier_out_mod.F90 | 10 +++++----- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 837a3b107..f821c9a74 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -50,7 +50,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_COMPLEX,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC IF(MYPROC > NPROC/2)THEN @@ -66,27 +66,27 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) !$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_LAT,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_CURRENT IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = D_NSTAGTF(KGL)/2 + IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) !$ACC LOOP SEQ DO JM=0,G_NMEN(IGLG) IPROC = D_NPROCM(JM) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KF_CURRENT*2 - PREEL_COMPLEX(2*JF-1+2*KF_TOTAL*(JM+IOFF_COMPLEX)) = FOUBUF(ISTA+2*JF-1) - PREEL_COMPLEX(2*JF +2*KF_TOTAL*(JM+IOFF_COMPLEX)) = FOUBUF(ISTA+2*JF ) + PREEL_COMPLEX(IOFF_LAT+2*JM+1) = FOUBUF(ISTA+2*JF-1) + PREEL_COMPLEX(IOFF_LAT+2*JM+2) = FOUBUF(ISTA+2*JF ) ENDDO !$ACC LOOP SEQ DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 ! Truncation (not sure what is the exact upper bound here...) ! Same is also in FSC for the new fields. I *think* it should be N/2+1 elements in total ! TODO: Make sure this is correct - PREEL_COMPLEX(2*JF-1+2*KF_TOTAL*(JM+IOFF_COMPLEX)) = 0._JPRBT - PREEL_COMPLEX(2*JF +2*KF_TOTAL*(JM+IOFF_COMPLEX)) = 0._JPRBT + PREEL_COMPLEX(IOFF_LAT+2*JM+1) = 0._JPRBT + PREEL_COMPLEX(IOFF_LAT+2*JM+2) = 0._JPRBT ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 29b95029b..8a481e8e5 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -49,7 +49,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_COMPLEX,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: SCAL @@ -72,11 +72,11 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_COMPLEX,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_LAT,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 - IOFF_COMPLEX = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) + IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) !$ACC LOOP SEQ @@ -84,8 +84,8 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) IPROC = D_NPROCM(JM) ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_COMPLEX+2*JM+1) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_COMPLEX+2*JM+2) + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) ENDDO ENDDO ENDDO diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index d5bf75147..0192ae198 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -163,9 +163,6 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF -! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now -CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) - ALLOCATE(PREEL_COMPLEX_TMP(KF_FS*D%NLENGTF)) !$ACC DATA PRESENT(PREEL_COMPLEX) CREATE(PREEL_COMPLEX_TMP) @@ -190,6 +187,9 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& !$ACC END DATA DEALLOCATE(PREEL_COMPLEX_TMP) +! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now +CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) + ! 2. Fourier space computations ! fill the rest of PREEL_COMPLEX From 3b498fe314893bc31232ba2c02a6aa6611652cc8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:21 -0700 Subject: [PATCH 162/263] INV: remove now redundant field --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 27 ------------------------ 1 file changed, 27 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 0192ae198..166f1ecdc 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -109,9 +109,6 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR INTEGER(KIND=JPIM) :: IFIRST INTEGER(KIND=JPIM) :: KF_FS -INTEGER(JPIM) :: KGL, KF, KI - -REAL(KIND=JPRBT), ALLOCATABLE :: PREEL_COMPLEX_TMP(:) ! ------------------------------------------------------------------ @@ -163,30 +160,6 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF - -ALLOCATE(PREEL_COMPLEX_TMP(KF_FS*D%NLENGTF)) -!$ACC DATA PRESENT(PREEL_COMPLEX) CREATE(PREEL_COMPLEX_TMP) -!$ACC KERNELS -PREEL_COMPLEX_TMP(:) = -1 -!$ACC END KERNELS -!$ACC PARALLEL LOOP COLLAPSE(2) -DO KGL=1,D%NDGL_FS - DO KF=1,KF_FS - !$ACC LOOP SEQ - DO KI=1,(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))/2 - PREEL_COMPLEX_TMP(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+1) & - & = PREEL_COMPLEX(KF_FS*D%NSTAGTF(KGL)+2*(KI-1)*KF_FS+2*(KF-1)+1) - PREEL_COMPLEX_TMP(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+2*(KI-1)+2) & - & = PREEL_COMPLEX(KF_FS*D%NSTAGTF(KGL)+2*(KI-1)*KF_FS+2*(KF-1)+2) - ENDDO - ENDDO -ENDDO -!$ACC KERNELS -PREEL_COMPLEX(:) = PREEL_COMPLEX_TMP(:) -!$ACC END KERNELS -!$ACC END DATA -DEALLOCATE(PREEL_COMPLEX_TMP) - ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) From a3f7d5fbedd891340637649d03fb34af2f67b330 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:22 -0700 Subject: [PATCH 163/263] Clean up unused functions from fft wrappers --- src/trans/gpu/CMakeLists.txt | 16 +- .../external/fourier/create_plan_fftc.cu | 121 -------------- .../external/fourier/destroy_plan_fftc.cu | 71 -------- .../{execute_plan_fftc.cu => fft_wrapper.cu} | 31 ---- .../algor/external/fourier/storage_fftc.cu | 22 --- src/trans/gpu/external/setup_trans.F90 | 2 - src/trans/gpu/internal/ftdir_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- src/trans/gpu/internal/set_resol_mod.F90 | 3 +- src/trans/gpu/internal/sufft_mod.F90 | 4 +- src/trans/gpu/internal/tpm_fftc.F90 | 157 +----------------- 11 files changed, 9 insertions(+), 422 deletions(-) delete mode 100644 src/trans/gpu/algor/external/fourier/create_plan_fftc.cu delete mode 100644 src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu rename src/trans/gpu/algor/external/fourier/{execute_plan_fftc.cu => fft_wrapper.cu} (89%) delete mode 100644 src/trans/gpu/algor/external/fourier/storage_fftc.cu diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 55cc4a943..82cd2424b 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -1,4 +1,5 @@ # (C) Copyright 2020- ECMWF. +# (C) Copyright 2022- NVIDIA. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -37,10 +38,7 @@ foreach( prec sp dp ) TARGET trans_gpu_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/destroy_plan_fftc.cu - algor/external/fourier/create_plan_fftc.cu - algor/external/fourier/storage_fftc.cu - algor/external/fourier/execute_plan_fftc.cu + algor/external/fourier/fft_wrapper.cu PUBLIC_INCLUDES $ $ $ @@ -55,10 +53,7 @@ foreach( prec sp dp ) TARGET trans_gpu_static_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/destroy_plan_fftc.cu - algor/external/fourier/create_plan_fftc.cu - algor/external/fourier/storage_fftc.cu - algor/external/fourier/execute_plan_fftc.cu + algor/external/fourier/fft_wrapper.cu TYPE STATIC PUBLIC_INCLUDES $ $ @@ -74,10 +69,7 @@ foreach( prec sp dp ) TARGET trans_gpu_static_CA_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/destroy_plan_fftc.cu - algor/external/fourier/create_plan_fftc.cu - algor/external/fourier/storage_fftc.cu - algor/external/fourier/execute_plan_fftc.cu + algor/external/fourier/fft_wrapper.cu TYPE STATIC PUBLIC_INCLUDES $ $ diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu deleted file mode 100644 index d47c28cec..000000000 --- a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu +++ /dev/null @@ -1,121 +0,0 @@ -#include "cufft.h" -#include "stdio.h" -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - } - - return ""; -} -#define CUFFT_CHECK(e) { \ - cufftResult_t err = (e); \ - if (err != CUFFT_SUCCESS) \ - { \ - fprintf(stderr, "CUFFT error: %s, line %d, %s: %s\n", \ - __FILE__, __LINE__, #e, _cudaGetErrorEnum(err)); \ - exit(EXIT_FAILURE); \ - } \ -} - -void *planWorkspace = nullptr; -static int currentWorkspaceSize = 0; - -extern "C" void create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, - int *LOTp, int *stridep, int *plan_size) { - int ISIGN = *ISIGNp; - int N = *Np; - int LOT = *LOTp; - int stride = *stridep; - - cufftHandle plan; - - if (cudaDeviceSynchronize() != cudaSuccess) { - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; - } - - int embed[1]; - int dist; - -#ifdef TRANS_SINGLE - cufftType cufft_1 = CUFFT_R2C; - cufftType cufft_2 = CUFFT_C2R; -#else - cufftType cufft_1 = CUFFT_D2Z; - cufftType cufft_2 = CUFFT_Z2D; -#endif - - embed[0] = 1; - dist = 1; - - CUFFT_CHECK(cufftCreate(&plan)); - - // Disable auto allocation - CUFFT_CHECK(cufftSetAutoAllocation(plan, false)); - - // printf("CreatePlan cuFFT\n","N=",N); - // printf("%s %d \n","plan=",plan); - // printf("%s %d \n","LOT=",LOT); - // printf("%s %d \n","ISIGN=",ISIGN); - // printf("%s %d \n","Np=",*Np); - - if (ISIGN == -1) { - CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, - stride, dist, cufft_1, LOT)); - } else if (ISIGN == 1) { - CUFFT_CHECK(cufftPlanMany(&plan, 1, &N, embed, stride, dist, embed, - stride, dist, cufft_2, LOT)); - } else { - abort(); - } - - // get size used by this plan - size_t thisWorkplanSize; - CUFFT_CHECK(cufftGetSize(plan, &thisWorkplanSize)); - - // check if this the work space is sufficiently large - if (thisWorkplanSize > currentWorkspaceSize) { - cudaDeviceSynchronize(); - cudaFree(planWorkspace); - cudaMalloc(&planWorkspace, thisWorkplanSize); - currentWorkspaceSize = thisWorkplanSize; - } - - if (cudaDeviceSynchronize() != cudaSuccess) { - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; - } - - *PLANp = plan; - *plan_size = thisWorkplanSize; - - return; -} diff --git a/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu deleted file mode 100644 index df8478345..000000000 --- a/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu +++ /dev/null @@ -1,71 +0,0 @@ -#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) -#include "cufft.h" -#include "stdio.h" -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - } - - return ""; -} - -inline void __cufftSafeCall(cufftResult err, const char *file, const int line) { - if (CUFFT_SUCCESS != err) { - fprintf(stderr, "CUFFT error at 1\n"); - fprintf(stderr, "CUFFT error in file '%s'\n", __FILE__); - fprintf(stderr, "CUFFT error at 2\n"); - /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ - fprintf(stderr, "CUFFT error at 3\n"); - /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: - %s\nterminating!\n",__FILE__, __LINE__,err, \ - _cudaGetErrorEnum(err)); \*/ - fprintf(stderr, "CUFFT error %d: %s\nterminating!\n", err, - _cudaGetErrorEnum(err)); - cudaDeviceReset(); - return; - } -} - -extern "C" void destroy_plan_fftc_(cufftHandle *PLANp) { - cufftHandle plan = *PLANp; - - if (cudaDeviceSynchronize() != cudaSuccess) { - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; - } - - cufftSafeCall(cufftDestroy(plan)); - - if (cudaDeviceSynchronize() != cudaSuccess) { - fprintf(stderr, "Cuda error: Failed to synchronize\n"); - return; - } -} diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu similarity index 89% rename from src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu rename to src/trans/gpu/algor/external/fourier/fft_wrapper.cu index ff2a6ebe1..26b84ddc4 100644 --- a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu +++ b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu @@ -61,37 +61,6 @@ static const char *_cudaGetErrorEnum(cufftResult error) { extern void *planWorkspace; -extern "C" void -#ifdef TRANS_SINGLE -execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftComplex *data_in, - cufftComplex *data_out) -#else -execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, - cufftDoubleComplex *data_out) -#endif -{ - cufftHandle plan = *PLANp; - int ISIGN = *ISIGNp; - - CUFFT_CHECK(cufftSetWorkArea(plan, planWorkspace)); - - if (ISIGN == -1) { -#ifdef TRANS_SINGLE - CUFFT_CHECK(cufftExecR2C(plan, (cufftReal *)data_in, data_out)); -#else - CUFFT_CHECK(cufftExecD2Z(plan, (cufftDoubleReal *)data_in, data_out)); -#endif - } else if (ISIGN == 1) { -#ifdef TRANS_SINGLE - CUFFT_CHECK(cufftExecC2R(plan, data_in, (cufftReal *)data_out)); -#else - CUFFT_CHECK(cufftExecZ2D(plan, data_in, (cufftDoubleReal *)data_out)); -#endif - } else { - abort(); - } -} - namespace { struct Double { using real = double; diff --git a/src/trans/gpu/algor/external/fourier/storage_fftc.cu b/src/trans/gpu/algor/external/fourier/storage_fftc.cu deleted file mode 100644 index 729195338..000000000 --- a/src/trans/gpu/algor/external/fourier/storage_fftc.cu +++ /dev/null @@ -1,22 +0,0 @@ -#include "cufft.h" -#include "stdio.h" -extern "C" cufftDoubleComplex *create_storage_(int *Np) { - int N = *Np; - cufftDoubleComplex *data; - /*cudaMalloc((void**)&data,sizeof(cufftDoubleComplex)*N); - if (cudaGetLastError() != cudaSuccess){ - fprintf(stderr, "Cuda error: Failed to allocate\n"); - return 0; - } - return data;*/ - printf("%s %d \n", "sizeof(cufftDoubleComplex)=", sizeof(cufftDoubleComplex)); - printf("%s %d \n", "N=", N); - if (cudaMalloc(&data, sizeof(cufftDoubleComplex) * N) == cudaSuccess) { - printf("%s %X \n", "data ", data); - return data; - } - fprintf(stderr, "Cuda error: Failed to allocate\n"); - return 0; -} - -extern "C" void destroy_storage_(int *ptr) { cudaFree(ptr); } diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index dbd1a9fad..f6e92c81b 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -116,7 +116,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif -USE TPM_FFTC ,ONLY : TC, FFTC_RESOL USE TPM_FLT USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN USE TPM_CTL @@ -202,7 +201,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #ifdef WITH_FFTW ALLOCATE(FFTW_RESOL(NMAX_RESOL)) #endif - ALLOCATE(FFTC_RESOL(NMAX_RESOL)) ALLOCATE(FLT_RESOL(NMAX_RESOL)) ALLOCATE(CTL_RESOL(NMAX_RESOL)) GEOM_RESOL(:)%LAM=.FALSE. diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 908fed977..e1e995347 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -49,7 +49,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_DIR_FFT +USE TPM_FFTC ,ONLY : EXECUTE_DIR_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 435989cd0..0ee01bf63 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -49,7 +49,7 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_INV_FFT +USE TPM_FFTC ,ONLY : EXECUTE_INV_FFT USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX diff --git a/src/trans/gpu/internal/set_resol_mod.F90 b/src/trans/gpu/internal/set_resol_mod.F90 index 61486f6c0..e317e89a5 100755 --- a/src/trans/gpu/internal/set_resol_mod.F90 +++ b/src/trans/gpu/internal/set_resol_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -22,7 +23,6 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) #ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif -USE TPM_FFTC ,ONLY : TC, FFTC_RESOL USE TPM_FLT USE TPM_CTL ,ONLY : C, CTL_RESOL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS @@ -68,7 +68,6 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) #ifdef WITH_FFTW TW => FFTW_RESOL(NCUR_RESOL) #endif - TC => FFTC_RESOL(NCUR_RESOL) S => FLT_RESOL(NCUR_RESOL) C => CTL_RESOL(NCUR_RESOL) ENDIF diff --git a/src/trans/gpu/internal/sufft_mod.F90 b/src/trans/gpu/internal/sufft_mod.F90 index 8b06ed634..27142dc0c 100755 --- a/src/trans/gpu/internal/sufft_mod.F90 +++ b/src/trans/gpu/internal/sufft_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -18,7 +19,6 @@ SUBROUTINE SUFFT USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_GEOMETRY ,ONLY : G USE TPM_FFT ,ONLY : T - USE TPM_FFTC ,ONLY : TC, INIT_PLANS_FFT #ifdef WITH_FFTW USE TPM_FFTW ,ONLY : TW, INIT_PLANS_FFTW #endif @@ -46,8 +46,6 @@ SUBROUTINE SUFFT ENDIF #endif - CALL INIT_PLANS_FFT(R%NDLON) - ENDIF ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 index dbad5f624..949eafdc4 100755 --- a/src/trans/gpu/internal/tpm_fftc.F90 +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -20,37 +20,12 @@ MODULE TPM_FFTC USE, INTRINSIC :: ISO_C_BINDING -USE PARKIND_ECTRANS, ONLY: JPIM -USE MPL_MODULE, ONLY : MPL_MYRANK - IMPLICIT NONE SAVE PRIVATE -PUBLIC CREATE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, INIT_PLANS_FFT, & - & FFTC_RESOL, TC, EXECUTE_DIR_FFT, EXECUTE_INV_FFT - -TYPE FFTC_TYPE - INTEGER(KIND=JPIM),POINTER :: N_PLANS(:) - TYPE(FFTC_PLAN),POINTER :: FFTC_PLANS(:) - INTEGER(KIND=JPIM) :: N_MAX=0 - INTEGER(KIND=JPIM) :: N_MAX_PLANS=8 -END TYPE FFTC_TYPE - - -TYPE FFTC_PLAN - INTEGER(KIND=JPIM) :: NPLAN_ID=123456 - INTEGER(KIND=JPIM) :: NPLAN - INTEGER(KIND=JPIM) :: NLOT - INTEGER(KIND=JPIM) :: NSTRIDE - INTEGER(KIND=JPIM) :: NTYPE - TYPE(FFTC_PLAN),POINTER :: NEXT_PLAN => NULL() -END TYPE FFTC_PLAN - -TYPE(FFTC_TYPE),ALLOCATABLE,TARGET :: FFTC_RESOL(:) -TYPE(FFTC_TYPE),POINTER :: TC - +PUBLIC EXECUTE_DIR_FFT, EXECUTE_INV_FFT INTERFACE EXECUTE_DIR_FFT SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) @@ -99,136 +74,6 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS) CONTAINS ! ------------------------------------------------------------------ - -SUBROUTINE INIT_PLANS_FFT(KDLON) -INTEGER(KIND=JPIM),INTENT(IN) :: KDLON - -TC%N_MAX=KDLON -ALLOCATE(TC%FFTC_PLANS(TC%N_MAX)) -ALLOCATE(TC%N_PLANS(TC%N_MAX)) -TC%N_PLANS(:)=0 -RETURN -END SUBROUTINE INIT_PLANS_FFT - - -SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT,KSTRIDE) -INTEGER(KIND=JPIM),INTENT(OUT) :: KPLAN -INTEGER(KIND=JPIM),INTENT(IN) :: KTYPE,KN,KLOT,KSTRIDE - -INTEGER(KIND=JPIM) :: IPLAN, IPLAN_SIZE -INTEGER(KIND=JPIM) :: IRANK, ISTRIDE -INTEGER(KIND=JPIM) :: JL, JN -INTEGER(KIND=JPIM) :: IRDIST,ICDIST,IN(1),IEMBED(1) -LOGICAL :: LLFOUND -LOGICAL :: LLRESTRICT_PLANS=.TRUE. -TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN,START_FFTC_PLAN -INTERFACE - SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT,KSTRIDE,PLAN_SIZE) BIND(C,NAME="create_plan_fftc_") - USE, INTRINSIC :: ISO_C_BINDING - INTEGER(C_INT) :: KPLAN - INTEGER(C_INT) :: KTYPE,KN,KLOT,KSTRIDE,PLAN_SIZE - END SUBROUTINE CREATE_PLAN_FFTC -END INTERFACE - -IF( KN > TC%N_MAX )THEN - CALL ABOR1('CREATE_PLAN_FFT: KN > N_MAX THAT WAS INITIALISED IN INIT_PLANS_FFTC') -ENDIF - -IRANK=1 -ISTRIDE=1 -IN(1)=KN -IEMBED(1)=IN(1) -ICDIST=KN/2+1 -IRDIST=ICDIST*2 - -!!$OMP CRITICAL -LLFOUND=.FALSE. -IF( TC%FFTC_PLANS(KN)%NPLAN_ID /= 123456 )THEN - WRITE(*,'("CREATE_PLAN_FFT.1: PLAN_ID=",I10)')TC%FFTC_PLANS(KN)%NPLAN_ID - CALL ABOR1('CREATE_PLAN_FFT.1: NPLAN_ID /= 123456') -ENDIF -CURR_FFTC_PLAN=>TC%FFTC_PLANS(KN) -IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN - WRITE(*,'("CREATE_PLAN_FFT.2: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID - CALL ABOR1('CREATE_PLAN_FFT.2: NPLAN_ID /= 123456') -ENDIF -! search for plan in existing plans -DO JL=1,TC%N_PLANS(KN) - IF( KLOT == CURR_FFTC_PLAN%NLOT .AND. KTYPE == CURR_FFTC_PLAN%NTYPE .AND. KSTRIDE == CURR_FFTC_PLAN%NSTRIDE )THEN - LLFOUND=.TRUE. - IPLAN=CURR_FFTC_PLAN%NPLAN - EXIT - ELSEIF( JL /= TC%N_PLANS(KN) )THEN - CURR_FFTC_PLAN=>CURR_FFTC_PLAN%NEXT_PLAN - IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN - WRITE(*,'("CREATE_PLAN_FFT.3: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID - CALL ABOR1('CREATE_PLAN_FFT.3: NPLAN_ID /= 123456') - ENDIF - ENDIF -ENDDO -IF( .NOT.LLFOUND )THEN - IF( LLRESTRICT_PLANS )THEN - IF( TC%N_PLANS(KN) == TC%N_MAX_PLANS )THEN - ! destroy the plan at the start of the list -! WRITE(*,'("CREATE_PLAN_FFT: BEG: DESTROYING A PLAN AT THE START OF THE LIST")') - CALL DESTROY_PLAN_FFT(TC%FFTC_PLANS(KN)%NPLAN) - TC%FFTC_PLANS(KN)%NPLAN_ID=999999 - START_FFTC_PLAN=>TC%FFTC_PLANS(KN) - TC%FFTC_PLANS(KN)=TC%FFTC_PLANS(KN)%NEXT_PLAN - ! DEALLOCATE(START_FFTC_PLAN) - TC%N_PLANS(KN)=TC%N_PLANS(KN)-1 -! WRITE(*,'("CREATE_PLAN_FFT: END: DESTROYING A PLAN AT THE START OF THE LIST")') - ENDIF - ENDIF - CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT,KSTRIDE,IPLAN_SIZE) - KPLAN=IPLAN - TC%N_PLANS(KN)=TC%N_PLANS(KN)+1 - IF( TC%N_PLANS(KN) /= 1 )THEN - ALLOCATE(CURR_FFTC_PLAN%NEXT_PLAN) - CURR_FFTC_PLAN=>CURR_FFTC_PLAN%NEXT_PLAN - ENDIF - IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN - WRITE(*,'("CREATE_PLAN_FFT.4: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID - CALL ABOR1('CREATE_PLAN_FFT.4: NPLAN_ID /= 123456') - ENDIF - CURR_FFTC_PLAN%NPLAN=IPLAN - CURR_FFTC_PLAN%NLOT=KLOT - CURR_FFTC_PLAN%NSTRIDE=KSTRIDE - CURR_FFTC_PLAN%NTYPE=KTYPE - CURR_FFTC_PLAN%NEXT_PLAN=>NULL() -! write(*,'("CREATE_PLAN_FFT: KN=",I5," NPLANS=",I3," KLOT=",I6," KTYPE=",I2,& -! & " NEW IPLAN=",Z16)')KN,TC%N_PLANS(KN),KLOT,KTYPE,IPLAN -ELSE - KPLAN=IPLAN -! write(*,'("CREATE_PLAN_FFT: KN=",I5," NPLANS=",I3," KLOT=",I6," KTYPE=",I2,& -! & " CUR IPLAN=",Z16)')KN,TC%N_PLANS(KN),KLOT,KTYPE,IPLAN -ENDIF -!!$OMP END CRITICAL - -RETURN -END SUBROUTINE CREATE_PLAN_FFT - - -SUBROUTINE DESTROY_PLAN_FFT(KPLAN) -INTEGER(KIND=JPIM),INTENT(IN) :: KPLAN -CALL DESTROY_PLAN_FFTC(KPLAN) -RETURN -END SUBROUTINE DESTROY_PLAN_FFT - - -SUBROUTINE DESTROY_ALL_PLANS_FFT -INTEGER(KIND=JPIM) :: JL, JN -TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN -DO JN=1,TC%N_MAX - CURR_FFTC_PLAN=>TC%FFTC_PLANS(JN) -ENDDO -WRITE(*,'("DESTROY_ALL_PLANS_FFTC: MPL_RANK=",I6," SUM(TC%N_PLANS(:))=",I10)')& - & MPL_MYRANK(), SUM(TC%N_PLANS(:)) -DEALLOCATE(TC%FFTC_PLANS) -DEALLOCATE(TC%N_PLANS) -RETURN -END SUBROUTINE DESTROY_ALL_PLANS_FFT - SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS) USE PARKIND_ECTRANS ,ONLY : JPIM From 3d8d069845732651281c5e9c061868a83909f211 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:22 -0700 Subject: [PATCH 164/263] Avoid reallocating PREEL (needed for cudaGraph) --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 17 +++++++++++------ src/trans/gpu/internal/ftinv_ctl_mod.F90 | 14 +++++++++++--- src/trans/gpu/internal/tpm_trans.F90 | 6 ++++++ src/trans/gpu/internal/trltog_mod.F90 | 8 +------- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 0d6031a1f..ddbe4f473 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -62,6 +62,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & !USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT +USE TPM_TRANS, ONLY: PREEL_PTR USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE USE FTDIR_MOD ,ONLY : FTDIR @@ -150,8 +151,16 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF -ALLOCATE(PREEL_REAL(KF_FS*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_REAL) +IF (.NOT. ALLOCATED(PREEL_PTR)) THEN + ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ELSEIF (SIZE(PREEL_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(PREEL_PTR) + DEALLOCATE(PREEL_PTR) + ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ENDIF +PREEL_REAL => PREEL_PTR ! Transposition @@ -179,10 +188,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & ENDIF CALL GSTATS(1640,1) -IF (ALLOCATED(PREEL_COMPLEX)) THEN - !$ACC EXIT DATA DELETE(PREEL_COMPLEX) - DEALLOCATE(PREEL_COMPLEX) -ENDIF CALL GSTATS(106,1) ! ------------------------------------------------------------------ END SUBROUTINE FTDIR_CTL diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index 166f1ecdc..ccdd218ed 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -62,7 +62,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& USE TPM_GEN ,ONLY : NERR, nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, PREEL_PTR USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TPM_FLT ,ONLY : S USE FOURIER_IN_MOD ,ONLY : FOURIER_IN @@ -131,8 +131,16 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST -ALLOCATE(PREEL_COMPLEX(KF_FS*D%NLENGTF)) -!$ACC ENTER DATA CREATE(PREEL_COMPLEX) +IF (.NOT. ALLOCATED(PREEL_PTR)) THEN + ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ELSEIF (SIZE(PREEL_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(PREEL_PTR) + DEALLOCATE(PREEL_PTR) + ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ENDIF +PREEL_COMPLEX => PREEL_PTR ! Initialize potentially unset offsets KSCALARS_NSDER_OFFSET = -1 diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 6010612c2..50d028c7d 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -62,4 +63,9 @@ MODULE TPM_TRANS REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) +! This is used in fourier space. It's reused among the transforms because +! we cannot reallocate - the captured CUDA graphs should not be modified. +! Hence, we keep it if it is large enough, otherwise we adapt the size. +REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: PREEL_PTR(:) + END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 91ab2d770..ba259cdf2 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -354,7 +354,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) ! Present until self contribution and packing are done - !$ACC DATA PRESENT(PREEL_REAL) ASYNC(1) + !$ACC DATA PRESENT(PREEL_REAL) CALL GSTATS(1806,1) ! Copy local contribution @@ -472,14 +472,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP !$ACC END DATA ! ZCOMBUFS !$ACC END DATA ! PREEL_REAL - !$ACC EXIT DATA DELETE(PREEL_REAL) ASYNC(1) IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) !$ACC WAIT(1) - DEALLOCATE(PREEL_REAL) - CALL GSTATS(805,0) IF (LSYNC_TRANS) THEN @@ -1276,9 +1273,6 @@ SUBROUTINE TRLTOG(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1606,1) IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - - !$ACC EXIT DATA DELETE(PREEL_REAL) - DEALLOCATE(PREEL_REAL) IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) From 8f576332523e402f9f1fbb3937b380ff54da5148 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:22 -0700 Subject: [PATCH 165/263] TRGTOL: Use contiguous memory accesses --- src/trans/gpu/internal/trgtol_mod.F90 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index dbe0e0676..44b9b4077 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -280,8 +280,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& CALL GSTATS(1601,0) IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JL=1,ISEND_WSET_SIZE_V - DO JFLD=1,KF_FS + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) @@ -292,8 +292,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JL=1,ISEND_WSET_SIZE_V - DO JFLD=1,KF_FS + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) @@ -386,8 +386,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW) IF(PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JL=1,ISEND_WSET_SIZE_V - DO JFLD=1,ISEND_FIELD_COUNT_V + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) @@ -397,8 +397,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF) ASYNC(1) - DO JL=1,ISEND_WSET_SIZE_V - DO JFLD=1,ISEND_FIELD_COUNT_V + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) From a8870cc0671c39e1f1a9201ded21b5af0fa95d04 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:22 -0700 Subject: [PATCH 166/263] TRLTOG: Use contiguous memory accesses --- src/trans/gpu/internal/trltog_mod.F90 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index ba259cdf2..236bf237e 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -383,8 +383,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) IF (PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) - DO JL=1,IRECV_WSET_SIZE_V - DO JFLD=1,KF_FS + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) @@ -395,8 +395,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) - DO JL=1,IRECV_WSET_SIZE_V - DO JFLD=1,KF_FS + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) @@ -460,8 +460,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) COLLAPSE(2) ASYNC(1) - DO JL=1,ILEN - DO JFLD=1,KF_FS + DO JFLD=1,KF_FS + DO JL=1,ILEN IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) @@ -553,8 +553,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) IF (PRESENT(PGP)) THEN !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JL=1,IRECV_WSET_SIZE_V - DO JFLD=1,IRECV_FIELD_COUNT_V + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD=IFLDA(JFLD) @@ -564,8 +564,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP ENDDO ELSE !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JL=1,IRECV_WSET_SIZE_V - DO JFLD=1,IRECV_FIELD_COUNT_V + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD=IFLDA(JFLD) From 65d5b30f354d1c34ee46df931032f6140cb09f01 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:22 -0700 Subject: [PATCH 167/263] Merge loops in Fourier_IN --- src/trans/gpu/internal/fourier_in_mod.F90 | 36 +++++++++++++---------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index f821c9a74..39b7498b2 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -41,7 +41,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN,G_NLOEN_MAX ! IMPLICIT NONE @@ -52,6 +52,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX IF(MYPROC > NPROC/2)THEN IBEG=1 @@ -66,27 +67,30 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) !$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_LAT,JM,IPROC,ISTA) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & +!$ACC& ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_CURRENT - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) + DO JM=0,(G_NLOEN_MAX+4)/2-1 + IGLG = OFFSET_VAR+KGL-1 - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KF_CURRENT*2 - - PREEL_COMPLEX(IOFF_LAT+2*JM+1) = FOUBUF(ISTA+2*JF-1) - PREEL_COMPLEX(IOFF_LAT+2*JM+2) = FOUBUF(ISTA+2*JF ) - ENDDO - !$ACC LOOP SEQ - DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 ! Truncation (not sure what is the exact upper bound here...) ! Same is also in FSC for the new fields. I *think* it should be N/2+1 elements in total ! TODO: Make sure this is correct - PREEL_COMPLEX(IOFF_LAT+2*JM+1) = 0._JPRBT - PREEL_COMPLEX(IOFF_LAT+2*JM+2) = 0._JPRBT + IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + IF (JM <= G_NMEN(IGLG)) THEN + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KF_CURRENT*2 + + RET_REAL = FOUBUF(ISTA+2*JF-1) + RET_COMPLEX = FOUBUF(ISTA+2*JF ) + ENDIF + PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX + ENDIF ENDDO ENDDO ENDDO From d0eed6e0d3bd2f71643eb42abb6fa390772f54af Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:23 -0700 Subject: [PATCH 168/263] Merge loops in FOURIER_OUT --- src/trans/gpu/internal/fourier_out_mod.F90 | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 8a481e8e5..41af4246c 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -41,6 +41,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN +USE TPM_DIM, ONLY: R_NSMAX ! IMPLICIT NONE @@ -72,20 +73,21 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IGLG,IOFF_LAT,JM,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KFIELDS - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL)) - - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 - - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + IPROC = D_NPROCM(JM) + ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 + + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) + ENDIF ENDDO ENDDO ENDDO From 41a57d4cf8a0a8f0cb193b635e285415b860c515 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:23 -0700 Subject: [PATCH 169/263] Fix memory accesses and merge loops for FSC --- src/trans/gpu/internal/fsc_mod.F90 | 147 ++++++++++++++++------------- 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 4f70ea548..8bd218a3a 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -48,10 +48,11 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NSTAGTF -USE TPM_GEOMETRY ,ONLY : G, G_NMEN, G_NLOEN +USE TPM_GEOMETRY ,ONLY : G, G_NMEN, G_NLOEN, G_NLOEN_MAX USE TPM_FIELDS ,ONLY : F USE TPM_FLT ,ONLY: S USE TPM_GEN, ONLY: NOUT +USE TPM_DIM, ONLY: R_NSMAX USE TPM_TRANS ,ONLY : LATLON ! @@ -65,6 +66,7 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC +REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV, KF_SCALARS @@ -96,21 +98,22 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !* 1.1 U AND V. -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) - IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - - ZACHTE2 = F%RACTHE(IGLG) - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(IOFF_UV+2*JM+1) = & - & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2 - PREEL_COMPLEX(IOFF_UV+2*JM+2) = & - & PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2 + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + ZACHTE2 = F%RACTHE(IGLG) + + PREEL_COMPLEX(IOFF_UV+2*JM+1) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_UV+2*JM+2) = & + & PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2 + ENDIF ENDDO ENDDO ENDDO @@ -118,21 +121,22 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !* 1.2 N-S DERIVATIVES IF (KSCALARS_NSDER_OFFSET >= 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) - IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - - ZACHTE2 = F%RACTHE(IGLG) - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1) = & - & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1)*ZACHTE2 - PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2) = & - & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2)*ZACHTE2 + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + ZACHTE2 = F%RACTHE(IGLG) + + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+1)*ZACHTE2 + PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2) = & + & PREEL_COMPLEX(IOFF_KSCALARS_NSDER+2*JM+2)*ZACHTE2 + ENDIF ENDDO ENDDO ENDDO @@ -146,27 +150,32 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !* 2.1 U AND V. IF (KUV_EWDER_OFFSET >= 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) - IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - - ZACHTE2 = F%RACTHE(IGLG) - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = & - & -PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = & - & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) - ENDDO - !$ACC LOOP SEQ - DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = 0._JPRBT - PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = 0._JPRBT + DO JM=0,(G_NLOEN_MAX+4)/2-1 + IGLG = OFFSET_VAR+KGL-1 + ! see comment in fourier_in + IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + + IF (JM <= G_NMEN(IGLG)) THEN + ZACHTE2 = F%RACTHE(IGLG) + + RET_REAL = & + & -PREEL_COMPLEX(IOFF_UV+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + RET_COMPLEX = & + & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) + ENDIF + ! The rest from G_NMEN(IGLG)+1...MAX is zero truncated + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = RET_COMPLEX + ENDIF ENDDO ENDDO ENDDO @@ -174,29 +183,33 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !* 2.2 SCALAR VARIABLES IF (KSCALARS_EWDER_OFFSET > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) ASYNC(1) & - !$ACC& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,JM) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) ASYNC(1) & + !$ACC& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS - IGLG = OFFSET_VAR+KGL-1 - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) - IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - - - ZACHTE2 = F%RACTHE(IGLG) - - !$ACC LOOP SEQ - DO JM=0,G_NMEN(IGLG) - PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = & - & -PREEL_COMPLEX(IOFF_SCALARS+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) - PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = & - & PREEL_COMPLEX(IOFF_SCALARS+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) - ENDDO - !$ACC LOOP SEQ - DO JM=G_NMEN(IGLG)+1,(G_NLOEN(IGLG)+4)/2-1 - PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = 0._JPRBT - PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = 0._JPRBT + DO JM=0,(G_NLOEN_MAX+4)/2-1 + IGLG = OFFSET_VAR+KGL-1 + ! see comment in fourier_in + IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + + IF (JM <= G_NMEN(IGLG)) THEN + ZACHTE2 = F%RACTHE(IGLG) + + RET_REAL = & + & -PREEL_COMPLEX(IOFF_SCALARS+2*JM+2)*ZACHTE2*REAL(JM,JPRBT) + RET_COMPLEX = & + & PREEL_COMPLEX(IOFF_SCALARS+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) + ENDIF + ! The rest from G_NMEN(IGLG+1)...MAX is zero truncated + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_SCALARS_EWDER+2*JM+2) = RET_COMPLEX + ENDIF ENDDO ENDDO ENDDO From 7b6af6ce444daae190bd78d1971104da13040609 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:23 -0700 Subject: [PATCH 170/263] Merge loops for leinv (now same as ledir) --- src/trans/gpu/internal/leinv_mod.F90 | 48 +++++++++++++++------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 77138ebfa..f9fcac40e 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -299,31 +299,33 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC ENTER DATA CREATE(FOUBUF_IN) !$ACC DATA PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP - DO JK=1,KFIELDS - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - !$ACC LOOP SEQ - DO JGL=ISL,R_NDGNH - IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*KFIELDS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*KFIELDS - - IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ELSE - ! Imaginary values of KM=0 is zero, though I don't think we care - ZSOA = 0_JPRBT - ZAOA = 0_JPRBT + DO JGL=1,R_NDGNH + DO JK=1,KFIELDS + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*KFIELDS + OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*KFIELDS + + IF(KM /= 0) THEN + ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ELSE + ! Imaginary values of KM=0 is zero, though I don't think we care + ZSOA = 0_JPRBT + ZAOA = 0_JPRBT + ENDIF + + FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA + FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA ENDIF - - FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA - FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA ENDDO ENDDO ENDDO From b04308123c26f4ff13ee2bca9cfd8307777ced1a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:23 -0700 Subject: [PATCH 171/263] INV: Fourier_in should not over compute preel --- src/trans/gpu/internal/fourier_in_mod.F90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 39b7498b2..c88f4c1ad 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -71,13 +71,13 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) !$ACC& ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,KF_CURRENT - DO JM=0,(G_NLOEN_MAX+4)/2-1 + DO JM=0,G_NLOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 - ! Truncation (not sure what is the exact upper bound here...) - ! Same is also in FSC for the new fields. I *think* it should be N/2+1 elements in total - ! TODO: Make sure this is correct - IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) RET_REAL = 0.0_JPRBT RET_COMPLEX = 0.0_JPRBT From b93c6a171e7f012bd3e5dd0f020b93b60ced9ea6 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:23 -0700 Subject: [PATCH 172/263] INV: FSC should not over compute preel --- src/trans/gpu/internal/fsc_mod.F90 | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 8bd218a3a..7755e81a2 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -153,10 +153,12 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) ASYNC(1) DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV - DO JM=0,(G_NLOEN_MAX+4)/2-1 + DO JM=0,G_NLOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 - ! see comment in fourier_in - IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN IOFF_LAT = KF_FS*D_NSTAGTF(KGL) IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) @@ -172,7 +174,6 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & RET_COMPLEX = & & PREEL_COMPLEX(IOFF_UV+2*JM+1)*ZACHTE2*REAL(JM,JPRBT) ENDIF - ! The rest from G_NMEN(IGLG)+1...MAX is zero truncated PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+1) = RET_REAL PREEL_COMPLEX(IOFF_UV_EWDER+2*JM+2) = RET_COMPLEX ENDIF @@ -187,10 +188,12 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & !$ACC& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2) DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS - DO JM=0,(G_NLOEN_MAX+4)/2-1 + DO JM=0,G_NLOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 - ! see comment in fourier_in - IF (JM <= (G_NLOEN(IGLG)+4)/2-1) THEN + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN IOFF_LAT = KF_FS*D_NSTAGTF(KGL) IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) From c62aa3339d6b8ce019e84aff8fddf6d96975fbc5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:24 -0700 Subject: [PATCH 173/263] Remove any extra padding in PREEL (CHANGE1: 8) (CHANGE2: 10) --- src/trans/gpu/internal/sump_trans_mod.F90 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 8669aa95d..b0cb2eee3 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -262,12 +262,12 @@ SUBROUTINE SUMP_TRANS DO JGL=1,D%NDGL_FS D%NSTAGTF(JGL) = IOFF IGL = D%NPTRLS(MYSETW) + JGL - 1 - IOFF = IOFF + G%NLOEN(IGL)+3 - ! Make sure IOFF is even. This could really lead to slightly too large buffers - ! esp because the (+3) above (needed?), but it is crucial to have those even - ! because with these offsets we can store complex numbers, and CUFFT won't accept - ! unaligned complex buffers - IOFF = (IOFF+1)/2*2 + ! Each latitude should be able to store NLON real values, or floor(NLON/2)+1 + ! complex values. Note that IOFF should always be even, because we need to + ! store complex values (i.e. 2 floats), but this is the case anyway. + ! WARNING: Extra padding changes results, potentially, though it does not + ! cause wrong results. + IOFF = IOFF + (G%NLOEN(IGL)/2+1)*2 ENDDO D%NSTAGTF(D%NDGL_FS+1) = IOFF D%NLENGTF = IOFF From 3aef2413eb1bb96593852f0f7e540db1bf5ccab0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:24 -0700 Subject: [PATCH 174/263] Improve fourier_* (tiling for transposition) --- src/trans/gpu/internal/fourier_in_mod.F90 | 6 +++--- src/trans/gpu/internal/fourier_out_mod.F90 | 12 +++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index c88f4c1ad..fbc7951f9 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -64,12 +64,12 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) IINC=-1 ENDIF -!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) +!$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) OFFSET_VAR=D_NPTRLS(MYSETW) !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & -!$ACC& ASYNC(1) -DO KGL=IBEG,IEND,IINC +!$ACC& ASYNC(1) TILE(32,8,1) +DO KGL=1,D%NDGL_FS DO JF=1,KF_CURRENT DO JM=0,G_NLOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 41af4246c..465be8991 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -68,15 +68,17 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS*2)) !$ACC ENTER DATA CREATE(FOUBUF_IN) -!$ACC DATA PRESENT(G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) +!$ACC DATA PRESENT(D,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,SCAL) DEFAULT(NONE) ASYNC(1) -DO KGL=IBEG,IEND,IINC - DO JF=1,KFIELDS - DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,SCAL) DEFAULT(NONE) & +!$ACC& ASYNC(1) TILE(32,8,1) +DO KGL=1,D%NDGL_FS + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + DO JF=1,KFIELDS IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) From 3a369cd68afcdd7109cb618123903ba9dc678194 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:24 -0700 Subject: [PATCH 175/263] Mnor cleanup in sump_trans_od --- src/trans/gpu/internal/sump_trans_mod.F90 | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index b0cb2eee3..57151a836 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -39,7 +39,6 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM -INTEGER(KIND=JPIM) :: I1,I2,I3,IAUX0,IAUX1,JA1 INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) @@ -143,26 +142,15 @@ SUBROUTINE SUMP_TRANS ENDDO ENDDO - IAUX0 = 0 - IAUX1 = 0 - DO JA=1,NPRTRNS-1 - I1 = MYSENDSET(NPRTRNS,MYSETW,JA) - I2 = MYRECVSET(NPRTRNS,MYSETW,JA) - I3 = -1 - DO JA1=1,NPRTRNS-1 - IF(MYSENDSET(NPRTRNS,MYSETW,JA1) == I2) I3 =MYRECVSET(NPRTRNS,MYSETW,JA1) - ENDDO - IAUX0 = MAX(D%NLTSFTB(I1),D%NLTSGTB(I2),IAUX0) - IAUX1 = MAX(D%NLTSGTB(I2),D%NLTSFTB(I3),IAUX1) - ENDDO - IAUX0 = MAX(D%NLTSGTB(MYSETW),IAUX0) - IAUX1 = MAX(D%NLTSGTB(MYSETW),IAUX1) D%NSTAGT0B(1) = 0 D%NSTAGT1B(1) = 0 DO JA=2,NPRTRNS D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) ENDDO + ! D%NSTAGT0B / D%NSTAGT1B: offset of peer rank in send/recv buffer + ! D%NLTSGTB / D%NLTSFTB : size of peer rank in send/recv buffer + ! D%NPNTGTB0 / D%NPNTGTB1: translation inp to send buffer / recv to out buffer D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS) D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS) ENDIF From 50952c47a8942e1e1035c9eff01b7824bcb526eb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:24 -0700 Subject: [PATCH 176/263] Slightly simplify the foubuf indexing by storing global indices --- src/trans/gpu/internal/fourier_in_mod.F90 | 15 ++++++------ src/trans/gpu/internal/fourier_out_mod.F90 | 13 +++++----- src/trans/gpu/internal/ledir_mod.F90 | 8 +++---- src/trans/gpu/internal/leinv_mod.F90 | 8 +++---- src/trans/gpu/internal/sump_trans_mod.F90 | 28 ++++++++++++---------- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index fbc7951f9..8f5230b4b 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -40,7 +40,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT -USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN,G_NLOEN_MAX ! @@ -50,7 +50,7 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX @@ -64,11 +64,11 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) IINC=-1 ENDIF -!$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) +!$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & -!$ACC& ASYNC(1) TILE(32,8,1) +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & +!$ACC& ASYNC(1) TILE(32,16,1) DO KGL=1,D%NDGL_FS DO JF=1,KF_CURRENT DO JM=0,G_NLOEN_MAX/2 @@ -78,16 +78,15 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) ! to fill those floor(NLON/2)+1 values. ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. IF (JM <= G_NLOEN(IGLG)/2) THEN - IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) RET_REAL = 0.0_JPRBT RET_COMPLEX = 0.0_JPRBT IF (JM <= G_NMEN(IGLG)) THEN - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KF_CURRENT*2 + ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 RET_REAL = FOUBUF(ISTA+2*JF-1) RET_COMPLEX = FOUBUF(ISTA+2*JF ) ENDIF + IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX ENDIF diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index 465be8991..dae8aac1c 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -39,7 +39,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT -USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NSTAGT0B,D_NPNTGTB0,D_NPROCM,D_NPTRLS +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN USE TPM_DIM, ONLY: R_NSMAX ! @@ -50,7 +50,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: SCAL @@ -68,14 +68,14 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS*2)) !$ACC ENTER DATA CREATE(FOUBUF_IN) -!$ACC DATA PRESENT(D,G_NMEN,D_NPROCM,D_NSTAGT0B,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) +!$ACC DATA PRESENT(D,G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) ! scale results and move into next transformation buffer OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,IPROC,ISTA,SCAL) DEFAULT(NONE) & -!$ACC& ASYNC(1) TILE(32,8,1) +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) DEFAULT(NONE) & +!$ACC& ASYNC(1) TILE(32,16,1) DO KGL=1,D%NDGL_FS DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) DO JF=1,KFIELDS @@ -84,8 +84,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - IPROC = D_NPROCM(JM) - ISTA = (D_NSTAGT0B(IPROC)+D_NPNTGTB0(JM,KGL))*KFIELDS*2 + ISTA = D_NPNTGTB0(JM,KGL)*KFIELDS*2 FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 1eb92bf46..0066468b8 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -59,7 +59,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -101,7 +101,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,POA1) & -!$ACC& PRESENT(D_NPNTGTB1,D_NSTAGT1B,D_NPROCL) +!$ACC& PRESENT(D_NPNTGTB1) ! TODO this doesn't make sense that we need it (???) !$ACC KERNELS @@ -119,8 +119,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL) )+D_NPNTGTB1(KMLOC,JGL ))*2*KF_FS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS))+D_NPNTGTB1(KMLOC,IGLS))*2*KF_FS + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) IF (JF <= 4*KF_UV) THEN diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index f9fcac40e..253f21383 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -56,7 +56,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPROCL,D_NPNTGTB1,D_NSTAGT1B +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT USE CUDA_GEMM_BATCHED_MOD USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED @@ -106,7 +106,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & !$ACC& CREATE (ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & -!$ACC& PRESENT(D_MYMS,D_NPROCL,D_NSTAGT1B,D_NPNTGTB1,G_NDGLU) +!$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) ! TODO this doesn't make sense that we need it (???) !$ACC KERNELS @@ -308,8 +308,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = (D_NSTAGT1B(D_NPROCL(JGL)) + D_NPNTGTB1(KMLOC,JGL))*KFIELDS - OFFSET2 = (D_NSTAGT1B(D_NPROCL(IGLS)) + D_NPNTGTB1(KMLOC,IGLS))*KFIELDS + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*KFIELDS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*KFIELDS IF(KM /= 0) THEN ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 57151a836..d750a9367 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -110,6 +110,19 @@ SUBROUTINE SUMP_TRANS ALLOCATE(D%NPNTGTB1(D%NUMP,R%NDGL)) IF(LLP2)WRITE(NOUT,9) 'D%NPNTGTB1 ',SIZE(D%NPNTGTB1 ),SHAPE(D%NPNTGTB1 ) + ! Global offsets of processors + D%NSTAGT0B(1) = 0 + D%NSTAGT1B(1) = 0 + DO JA=2,NPRTRNS + D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) + D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) + ENDDO + + ! Global size of foubuf + D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS) + D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS) + + ! Global offsets of grid points DO JA=1,NPRTRW IPOS = 0 DO JGL=1,D%NULTPP(MYSETW) @@ -117,7 +130,7 @@ SUBROUTINE SUMP_TRANS DO JML=D%NPTRMS(JA),D%NPTRMS(JA)+D%NUMPP(JA)-1 IM = D%NALLMS(JML) IF (IM <= G%NMEN(IGL)) THEN - D%NPNTGTB0(IM,JGL) = IPOS + D%NPNTGTB0(IM,JGL) = D%NSTAGT0B(D%NPROCM(IM)) + IPOS IPOS = IPOS+1 ELSE D%NPNTGTB0(IM,JGL) = -99 @@ -133,7 +146,7 @@ SUBROUTINE SUMP_TRANS DO JM=1,D%NUMP IM = D%MYMS(JM) IF (IM <= G%NMEN(IGL)) THEN - D%NPNTGTB1(JM,IGL) = IPOS + D%NPNTGTB1(JM,IGL) = D%NSTAGT1B(D%NPROCL(IGL)) + IPOS IPOS = IPOS+1 ELSE D%NPNTGTB1(JM,IGL) = -99 @@ -141,18 +154,9 @@ SUBROUTINE SUMP_TRANS ENDDO ENDDO ENDDO - - D%NSTAGT0B(1) = 0 - D%NSTAGT1B(1) = 0 - DO JA=2,NPRTRNS - D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1) - D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1) - ENDDO ! D%NSTAGT0B / D%NSTAGT1B: offset of peer rank in send/recv buffer ! D%NLTSGTB / D%NLTSFTB : size of peer rank in send/recv buffer - ! D%NPNTGTB0 / D%NPNTGTB1: translation inp to send buffer / recv to out buffer - D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS) - D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS) + ! D%NPNTGTB0 / D%NPNTGTB1: translation inp to global send buffer / recv to out buffer ENDIF ! GRIDPOINT SPACE From 3ee84549dcc88fe0cf557551a06c036f7e0b5a61 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:24 -0700 Subject: [PATCH 177/263] Route all GEMMs through CUDA_GEMM_BATCHED interface --- .../gpu/internal/cuda_gemm_batched_mod.F90 | 175 ++++-------------- src/trans/gpu/internal/ledir_mod.F90 | 15 +- src/trans/gpu/internal/leinv_mod.F90 | 17 +- 3 files changed, 39 insertions(+), 168 deletions(-) diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 8032a1c28..f61eed095 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -1,137 +1,21 @@ MODULE CUDA_GEMM_BATCHED_MOD USE CUBLAS_MOD - USE PARKIND1, ONLY: JPRD, JPRM, JPIM, JPIB + USE PARKIND1, ONLY: JPRD, JPRM, JPIM - IMPLICIT NONE + IMPLICIT NONE -!! PRIVATE - PUBLIC CUDA_GEMM_BATCHED, CUDA_DGEMM_BATCHED_OVERLOAD, CUDA_DGEMM_BATCHED_1D_OVERLOAD + PRIVATE + PUBLIC CUDA_GEMM_BATCHED INTERFACE CUDA_GEMM_BATCHED - MODULE PROCEDURE CUDA_DGEMM_BATCHED_OVERLOAD - MODULE PROCEDURE CUDA_SGEMM_BATCHED_OVERLOAD - MODULE PROCEDURE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD - MODULE PROCEDURE CUDA_DGEMM_BATCHED_1D_OVERLOAD - MODULE PROCEDURE CUDA_SGEMM_BATCHED_1D_OVERLOAD - MODULE PROCEDURE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD - END INTERFACE CUDA_GEMM_BATCHED + MODULE PROCEDURE CUDA_DGEMM_BATCHED_1D_2D_1D_OVERLOAD + MODULE PROCEDURE CUDA_DGEMM_BATCHED_1D_3D_1D_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_BATCHED_1D_3D_1D_OVERLOAD + END INTERFACE CUDA_GEMM_BATCHED CONTAINS -SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) -CHARACTER, INTENT(IN) :: TRANSA -CHARACTER, INTENT(IN) :: TRANSB -INTEGER(KIND=JPIM) :: M -INTEGER(KIND=JPIM) :: N -INTEGER(KIND=JPIM) :: K -REAL(KIND=JPRD) :: ALPHA -REAL(KIND=JPRD), DIMENSION(:,:,:) :: AARRAY -INTEGER(KIND=JPIM) :: LDA -INTEGER(KIND=JPIM) :: STRIDEA -REAL(KIND=JPRD), DIMENSION(:,:,:) :: BARRAY -INTEGER(KIND=JPIM) :: LDB -INTEGER(KIND=JPIM) :: STRIDEB -REAL(KIND=JPRD) :: BETA -REAL(KIND=JPRD), DIMENSION(:,:,:) :: CARRAY -INTEGER(KIND=JPIM) :: LDC -INTEGER(KIND=JPIM) :: STRIDEC -INTEGER(KIND=JPIM) :: BATCHCOUNT -CALL CUDA_DGEMM_BATCHED( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) -END SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD - -SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) -CHARACTER, INTENT(IN) :: TRANSA -CHARACTER, INTENT(IN) :: TRANSB -INTEGER(KIND=JPIM) :: M -INTEGER(KIND=JPIM) :: N -INTEGER(KIND=JPIM) :: K -REAL(KIND=JPRM) :: ALPHA -REAL(KIND=JPRM), DIMENSION(:,:,:) :: AARRAY -INTEGER(KIND=JPIM) :: LDA -INTEGER(KIND=JPIM) :: STRIDEA -REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY -INTEGER(KIND=JPIM) :: LDB -INTEGER(KIND=JPIM) :: STRIDEB -REAL(KIND=JPRM) :: BETA -REAL(KIND=JPRM), DIMENSION(:,:,:) :: CARRAY -INTEGER(KIND=JPIM) :: LDC -INTEGER(KIND=JPIM) :: STRIDEC -INTEGER(KIND=JPIM) :: BATCHCOUNT - -CALL CUDA_SGEMM_BATCHED( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) -END SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD - -SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) - CHARACTER, INTENT(IN) :: TRANSA - CHARACTER, INTENT(IN) :: TRANSB - INTEGER(KIND=JPIM) :: M - INTEGER(KIND=JPIM) :: N - INTEGER(KIND=JPIM) :: K - REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:,:,:) :: AARRAY - INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIB) :: STRIDEA - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIB) :: STRIDEB - REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:,:,:) :: CARRAY - INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIB) :: STRIDEC - INTEGER(KIND=JPIM) :: BATCHCOUNT - - CALL CUDA_SGEMM_STRIDED_BATCHED( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) -END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD - -SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & +SUBROUTINE CUDA_DGEMM_BATCHED_1D_2D_1D_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -149,7 +33,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRD), DIMENSION(:,:,:) :: BARRAY + REAL(KIND=JPRD), DIMENSION(:,:) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRD) :: BETA @@ -158,6 +42,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_DGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & @@ -167,9 +52,9 @@ SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & & BETA, & & CARRAY, LDC, STRIDEC, & & BATCHCOUNT) - END SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD - - SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD( & + !$ACC END HOST_DATA +END SUBROUTINE CUDA_DGEMM_BATCHED_1D_2D_1D_OVERLOAD +SUBROUTINE CUDA_DGEMM_BATCHED_1D_3D_1D_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -183,20 +68,21 @@ SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD( & INTEGER(KIND=JPIM) :: M INTEGER(KIND=JPIM) :: N INTEGER(KIND=JPIM) :: K - REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + REAL(KIND=JPRD) :: ALPHA + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY + REAL(KIND=JPRD), DIMENSION(:,:,:) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB - REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + REAL(KIND=JPRD) :: BETA + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL CUDA_SGEMM_BATCHED( & + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) + CALL CUDA_DGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -205,9 +91,10 @@ SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD( & & BETA, & & CARRAY, LDC, STRIDEC, & & BATCHCOUNT) - END SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD - - SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD( & + !$ACC END HOST_DATA + END SUBROUTINE CUDA_DGEMM_BATCHED_1D_3D_1D_OVERLOAD + + SUBROUTINE CUDA_SGEMM_BATCHED_1D_3D_1D_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -224,17 +111,18 @@ SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD( & REAL(KIND=JPRM) :: ALPHA REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIB) :: STRIDEA + INTEGER(KIND=JPIM) :: STRIDEA REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIB) :: STRIDEB + INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIB) :: STRIDEC + INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL CUDA_SGEMM_STRIDED_BATCHED( & + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) + CALL CUDA_SGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -243,6 +131,7 @@ SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD( & & BETA, & & CARRAY, LDC, STRIDEC, & & BATCHCOUNT) - END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD + !$ACC END HOST_DATA + END SUBROUTINE CUDA_SGEMM_BATCHED_1D_3D_1D_OVERLOAD END MODULE CUDA_GEMM_BATCHED_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 0066468b8..91290f1cb 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -60,8 +60,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 -USE CUDA_GEMM_BATCHED_MOD!!, ONLY: CUDA_TCGEMM_BATCHED, CUDA_GEMM_BATCHED -USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED +USE CUDA_GEMM_BATCHED_MOD, ONLY: CUDA_GEMM_BATCHED USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE, INTRINSIC :: ISO_C_BINDING @@ -150,7 +149,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAA,ZINPA,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAA, R_NDGNH, & @@ -160,7 +158,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRBT, & & ZOUT, 2*KF_FS, TDZAA, & & D_NUMP) -!$ACC END HOST_DATA IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') @@ -199,8 +196,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T - !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUT0) - CALL CUDA_DGEMM_BATCHED( & + CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRD, & @@ -209,7 +205,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRD, & & ZOUT0, KF_FS, TDZAA, & & 1) - !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) DO J=1,(R_NSMAX+2)/2 @@ -231,7 +226,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -!$ACC HOST_DATA USE_DEVICE(ZAS,ZINPS,ZOUT) CALL CUDA_GEMM_BATCHED( & & 'N', 'N', & & 2*KF_FS, TDZAS, R_NDGNH, & @@ -241,7 +235,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRBT, & & ZOUT, 2*KF_FS, TDZAS, & & D_NUMP) -!$ACC END HOST_DATA IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') @@ -276,8 +269,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T - !$ACC host_data use_device(ZAS0,ZINP0,ZOUT0) - call CUDA_DGEMM_BATCHED( & + call CUDA_GEMM_BATCHED( & & 'N', 'N', & & KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRD, & @@ -286,7 +278,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRD, & & ZOUT0, KF_FS, TDZAS, & & 1) - !$ACC end host_data !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) DO J=1,(R_NSMAX+3)/2 diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 253f21383..71da45119 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -51,15 +51,14 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! ------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT -USE CUDA_GEMM_BATCHED_MOD -USE CUBLAS_MOD, ONLY : CUDA_DGEMM_BATCHED +USE CUDA_GEMM_BATCHED_MOD, ONLY: CUDA_GEMM_BATCHED USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC @@ -157,7 +156,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,0) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION -!$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFIELDS, R_NDGNH, TDZAA, & @@ -167,7 +165,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0._JPRBT, & & ZOUTA, KFIELDS, R_NDGNH, & & D_NUMP) -!$ACC END HOST_DATA IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -185,8 +182,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO - !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUT0) - CALL CUDA_DGEMM_BATCHED( & + CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFIELDS/2, R_NDGNH, TDZAA, & & 1.0_JPRD, & @@ -195,7 +191,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0.0_JPRD, & & ZOUT0, KFIELDS/2, R_NDGNH, & & 1) - !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) @@ -246,7 +241,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) -!$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFIELDS, R_NDGNH, TDZAS, & @@ -256,7 +250,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0._JPRBT, & & ZOUTS, KFIELDS, R_NDGNH, & & D_NUMP) -!$ACC END HOST_DATA IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -273,8 +266,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO - !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINP0,ZOUT0) - CALL CUDA_DGEMM_BATCHED( & + CALL CUDA_GEMM_BATCHED( & & 'N', 'T', & & KFIELDS/2, R_NDGNH, TDZAS, & & 1.0_JPRD, & @@ -283,7 +275,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0.0_JPRD, & & ZOUT0, KFIELDS/2, R_NDGNH, & & 1) - !$ACC END HOST_DATA !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) From a3043d95c13d1aeebe93a59c3c79f50f62c4c581 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:25 -0700 Subject: [PATCH 178/263] Partial cleanup in algor folder --- src/trans/gpu/CMakeLists.txt | 9 +- .../gemm/gemm_wrapper.cu} | 125 ++++++++++++++ src/trans/gpu/algor/interface/dbfgsl.h | 16 -- src/trans/gpu/algor/interface/dpseuclid.h | 11 -- src/trans/gpu/algor/interface/dysave.h | 27 --- src/trans/gpu/algor/interface/eigsol.h | 17 -- src/trans/gpu/algor/interface/intavg.h | 9 - src/trans/gpu/algor/interface/layeravg.h | 9 - src/trans/gpu/algor/interface/minv.h | 13 -- src/trans/gpu/algor/interface/minv_8.h | 13 -- src/trans/gpu/algor/interface/minv_caller.h | 9 - src/trans/gpu/algor/interface/multvdv.h | 8 - src/trans/gpu/algor/interface/mxmaop.h | 17 -- src/trans/gpu/algor/interface/mxptma.h | 16 -- src/trans/gpu/algor/interface/mxtrma.h | 14 -- src/trans/gpu/algor/interface/mxture.h | 16 -- src/trans/gpu/algor/interface/mxturhd.h | 14 -- src/trans/gpu/algor/interface/mxturs.h | 14 -- src/trans/gpu/algor/interface/n1cg1.h | 40 ----- src/trans/gpu/algor/interface/n1cga.h | 53 ------ src/trans/gpu/algor/interface/si_mxptco.h | 16 -- src/trans/gpu/algor/interface/simplico.h | 19 --- src/trans/gpu/algor/interface/sublayer.h | 9 - src/trans/gpu/algor/interface/suher.h | 18 -- src/trans/gpu/algor/interface/suhert.h | 14 -- src/trans/gpu/algor/interface/suhes.h | 14 -- src/trans/gpu/algor/interface/tridia.h | 25 --- src/trans/gpu/algor/internal/fourier/qpassf.F | 3 - src/trans/gpu/algor/internal/fourier/rpassf.F | 3 - .../gpu/algor/module/cublasSTCgemmBatched.cu | 107 ------------ .../gpu/algor/module/cublasSgemmBatched.cu | 125 -------------- .../module/cublasTCgemmBatched.actual.cu | 134 --------------- src/trans/gpu/algor/module/cublas_mod.F90 | 158 ------------------ .../gpu/internal/cuda_gemm_batched_mod.F90 | 40 ++++- 34 files changed, 166 insertions(+), 969 deletions(-) rename src/trans/gpu/algor/{module/cublasDgemmBatched.cu => external/gemm/gemm_wrapper.cu} (51%) delete mode 100644 src/trans/gpu/algor/interface/dbfgsl.h delete mode 100644 src/trans/gpu/algor/interface/dpseuclid.h delete mode 100644 src/trans/gpu/algor/interface/dysave.h delete mode 100644 src/trans/gpu/algor/interface/eigsol.h delete mode 100644 src/trans/gpu/algor/interface/intavg.h delete mode 100644 src/trans/gpu/algor/interface/layeravg.h delete mode 100644 src/trans/gpu/algor/interface/minv.h delete mode 100644 src/trans/gpu/algor/interface/minv_8.h delete mode 100644 src/trans/gpu/algor/interface/minv_caller.h delete mode 100644 src/trans/gpu/algor/interface/multvdv.h delete mode 100644 src/trans/gpu/algor/interface/mxmaop.h delete mode 100644 src/trans/gpu/algor/interface/mxptma.h delete mode 100644 src/trans/gpu/algor/interface/mxtrma.h delete mode 100644 src/trans/gpu/algor/interface/mxture.h delete mode 100644 src/trans/gpu/algor/interface/mxturhd.h delete mode 100644 src/trans/gpu/algor/interface/mxturs.h delete mode 100644 src/trans/gpu/algor/interface/n1cg1.h delete mode 100644 src/trans/gpu/algor/interface/n1cga.h delete mode 100644 src/trans/gpu/algor/interface/si_mxptco.h delete mode 100644 src/trans/gpu/algor/interface/simplico.h delete mode 100644 src/trans/gpu/algor/interface/sublayer.h delete mode 100644 src/trans/gpu/algor/interface/suher.h delete mode 100644 src/trans/gpu/algor/interface/suhert.h delete mode 100644 src/trans/gpu/algor/interface/suhes.h delete mode 100644 src/trans/gpu/algor/interface/tridia.h delete mode 100644 src/trans/gpu/algor/internal/fourier/qpassf.F delete mode 100644 src/trans/gpu/algor/internal/fourier/rpassf.F delete mode 100644 src/trans/gpu/algor/module/cublasSTCgemmBatched.cu delete mode 100644 src/trans/gpu/algor/module/cublasSgemmBatched.cu delete mode 100644 src/trans/gpu/algor/module/cublasTCgemmBatched.actual.cu delete mode 100644 src/trans/gpu/algor/module/cublas_mod.F90 diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 82cd2424b..f04e02045 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -38,7 +38,6 @@ foreach( prec sp dp ) TARGET trans_gpu_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/fft_wrapper.cu PUBLIC_INCLUDES $ $ $ @@ -53,7 +52,6 @@ foreach( prec sp dp ) TARGET trans_gpu_static_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/fft_wrapper.cu TYPE STATIC PUBLIC_INCLUDES $ $ @@ -69,7 +67,6 @@ foreach( prec sp dp ) TARGET trans_gpu_static_CA_${prec} LINKER_LANGUAGE Fortran SOURCES ${trans_src} - algor/external/fourier/fft_wrapper.cu TYPE STATIC PUBLIC_INCLUDES $ $ @@ -140,10 +137,8 @@ endforeach() ecbuild_add_library( TARGET gpu TYPE STATIC SOURCES - algor/module/cublasSgemmBatched.cu - algor/module/cublasDgemmBatched.cu -## algor/module/cublasSTCgemmBatched.cu - algor/module/IPC_Alltoall.cu + algor/external/fourier/fft_wrapper.cu + algor/external/gemm/gemm_wrapper.cu PRIVATE_INCLUDES ${MPI_C_INCLUDE_PATH} ) diff --git a/src/trans/gpu/algor/module/cublasDgemmBatched.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu similarity index 51% rename from src/trans/gpu/algor/module/cublasDgemmBatched.cu rename to src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 98fe25c2f..1e38e2ff4 100644 --- a/src/trans/gpu/algor/module/cublasDgemmBatched.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -139,3 +139,128 @@ extern "C" void cublasDgemmBatched_finalize () alreadyAllocated_dgemm=false; } +// +// Wrapper for cublasSgemm function. +// +// Alan Gray, NVIDIA +// + +#include +#include "cublas_v2.h" + + +bool alreadyAllocated_sgemm=false; +bool alreadyAllocated_sgemm_handle=false; + +float **d_Aarray_sgemm; +float **d_Barray_sgemm; +float **d_Carray_sgemm; + +float **Aarray_sgemm; +float **Barray_sgemm; +float **Carray_sgemm; + +cublasHandle_t handle_sgemm; + +extern "C" void cublasSgemmBatched_wrapper (char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda, int tda, const float *B, int ldb, int tdb, float beta, float *C, int ldc, int tdc, int batchCount) +{ + + // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); + + cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; + + if (transa=='T' || transa=='t') + op_t1=CUBLAS_OP_T; + + if (transb=='T' || transb=='t') + op_t2=CUBLAS_OP_T; + + //float **Aarray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + //float **Barray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + //float **Carray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + + if (!alreadyAllocated_sgemm_handle){ + cublasCreate(&handle_sgemm); + alreadyAllocated_sgemm_handle=true; + } + + if (!alreadyAllocated_sgemm){ + cudaMallocHost(&Aarray_sgemm,batchCount*sizeof(float*)); + cudaMallocHost(&Barray_sgemm,batchCount*sizeof(float*)); + cudaMallocHost(&Carray_sgemm,batchCount*sizeof(float*)); + alreadyAllocated_sgemm=true; + } + + cudaMalloc(&d_Aarray_sgemm,batchCount*sizeof(float*)); + cudaMalloc(&d_Barray_sgemm,batchCount*sizeof(float*)); + cudaMalloc(&d_Carray_sgemm,batchCount*sizeof(float*)); + + int i; + for(i=0;i -#include "cublas_v2.h" - -bool alreadyAllocated_stcgemm = false; -bool alreadyAllocated_stcgemm_handle = false; - -half **d_Aarray_stcgemm; -half **d_Barray_stcgemm; -float **d_Carray_stcgemm; - -half **Aarray_stcgemm; -half **Barray_stcgemm; -float **Carray_stcgemm; - -cublasHandle_t handle_stcgemm; - -extern "C" void cublasSTCgemmBatched_wrapper( - char transa, char transb, - int m, int n, int k, - float alpha, - const half *A, int lda, int tda, - const half *B, int ldb, int tdb, - float beta, - float *C, int ldc, int tdc, - int batchCount -){ - // Define CUBLAS operation handles - cublasOperation_t op_t1, op_t2; - - // Decide whether to transpose matrices or not - op_t1 = (transa == 'T' || transa == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; - op_t2 = (transb == 'T' || transb == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; - - // Initialize CUBLAS handle - if (!alreadyAllocated_stcgemm_handle) { - cublasCreate(&handle_stcgemm); - alreadyAllocated_stcgemm_handle = true; - } - - // Allocate host arrays - if (!alreadyAllocated_stcgemm) { - cudaMallocHost(&Aarray_stcgemm,batchCount*sizeof(half*)); - cudaMallocHost(&Barray_stcgemm,batchCount*sizeof(half*)); - cudaMallocHost(&Carray_stcgemm,batchCount*sizeof(float*)); - alreadyAllocated_stcgemm = true; - } - - // Allocate device arrays - cudaMalloc(&d_Aarray_stcgemm, batchCount*sizeof(half*)); - cudaMalloc(&d_Barray_stcgemm, batchCount*sizeof(half*)); - cudaMalloc(&d_Carray_stcgemm, batchCount*sizeof(float*)); - - // Transfer data from input arrays to host arrays - for (int i = 0; i < batchCount; i++) { - Aarray_stcgemm[i] = (half*) &(A[i*lda*tda]); - Barray_stcgemm[i] = (half*) &(B[i*ldb*tdb]); - Carray_stcgemm[i] = (float*) &(C[i*ldc*tdc]); - } - - // Transfer data from host arrays to device arrays - cudaMemcpy(d_Aarray_stcgemm, Aarray_stcgemm, batchCount*sizeof(half*), cudaMemcpyHostToDevice); - cudaMemcpy(d_Barray_stcgemm, Barray_stcgemm, batchCount*sizeof(half*), cudaMemcpyHostToDevice); - cudaMemcpy(d_Carray_stcgemm, Carray_stcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); - - // Perform batched SGEMM - cublasGemmBatchedEx(handle_stcgemm, - op_t1, op_t2, - m, n, k, - (const void*)&alpha, - (const void**)d_Aarray_stcgemm, CUDA_R_16F, lda, - (const void**)d_Barray_stcgemm, CUDA_R_16F, ldb, - (const void*)&beta, - (void**)d_Carray_stcgemm, CUDA_R_32F, ldc, - batchCount, - CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP - ); - - cudaDeviceSynchronize(); - - // Free device arrays - cudaFree(d_Aarray_stcgemm); - cudaFree(d_Barray_stcgemm); - cudaFree(d_Carray_stcgemm); -} - -extern "C" void cublasSTCgemmBatched_finalize() { - if (alreadyAllocated_stcgemm) { - cudaFree(Aarray_stcgemm); - cudaFree(Barray_stcgemm); - cudaFree(Carray_stcgemm); - - cudaFree(d_Aarray_stcgemm); - cudaFree(d_Barray_stcgemm); - cudaFree(d_Carray_stcgemm); - } - - if (alreadyAllocated_stcgemm_handle) { - cublasDestroy(handle_stcgemm); - } -} diff --git a/src/trans/gpu/algor/module/cublasSgemmBatched.cu b/src/trans/gpu/algor/module/cublasSgemmBatched.cu deleted file mode 100644 index 61c8384ac..000000000 --- a/src/trans/gpu/algor/module/cublasSgemmBatched.cu +++ /dev/null @@ -1,125 +0,0 @@ -// -// Wrapper for cublasSgemm function. -// -// Alan Gray, NVIDIA -// - -#include -#include "cublas_v2.h" - - -bool alreadyAllocated_sgemm=false; -bool alreadyAllocated_sgemm_handle=false; - -float **d_Aarray_sgemm; -float **d_Barray_sgemm; -float **d_Carray_sgemm; - -float **Aarray_sgemm; -float **Barray_sgemm; -float **Carray_sgemm; - -cublasHandle_t handle_sgemm; - -extern "C" void cublasSgemmBatched_wrapper (char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda, int tda, const float *B, int ldb, int tdb, float beta, float *C, int ldc, int tdc, int batchCount) -{ - - // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); - - cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=CUBLAS_OP_T; - - if (transb=='T' || transb=='t') - op_t2=CUBLAS_OP_T; - - //float **Aarray_sgemm = (float**) malloc(batchCount*sizeof(float*)); - //float **Barray_sgemm = (float**) malloc(batchCount*sizeof(float*)); - //float **Carray_sgemm = (float**) malloc(batchCount*sizeof(float*)); - - if (!alreadyAllocated_sgemm_handle){ - cublasCreate(&handle_sgemm); - alreadyAllocated_sgemm_handle=true; - } - - if (!alreadyAllocated_sgemm){ - cudaMallocHost(&Aarray_sgemm,batchCount*sizeof(float*)); - cudaMallocHost(&Barray_sgemm,batchCount*sizeof(float*)); - cudaMallocHost(&Carray_sgemm,batchCount*sizeof(float*)); - alreadyAllocated_sgemm=true; - } - - cudaMalloc(&d_Aarray_sgemm,batchCount*sizeof(float*)); - cudaMalloc(&d_Barray_sgemm,batchCount*sizeof(float*)); - cudaMalloc(&d_Carray_sgemm,batchCount*sizeof(float*)); - - int i; - for(i=0;i -#include "cublas_v2.h" - - -bool alreadyAllocated_tcgemm=false; -bool alreadyAllocated_tcgemm_handle=false; - -// Device arrays -//half **d_Aarray_h; -//half **d_Barray_h; -float **d_Aarray_h; -float **d_Barray_h; -float **d_Aarray_tcgemm; -float **d_Barray_tcgemm; -float **d_Carray_tcgemm; - -// Host arrays -float **Aarray_tcgemm; -float **Barray_tcgemm; -float **Carray_tcgemm; - -cublasHandle_t handle_tcgemm; - -// Converts from single-precision to half-precision (CUDA kernel) -__global__ void float2half(half *out, const float *in, int n) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx < n) { - out[idx] = __float2half(in[idx]); - } -} - - - -extern "C" void cublasTCgemmBatched_wrapper(char transa, char transb, - int m, int n, int k, - float alpha, - const float *A, int lda, int tda, - const float *B, int ldb, int tdb, - float beta, - float *C, int ldc, int tdc, - int batchCount) -{ - fprintf(stderr, "Using Tensor Core\n"); - - // Set transpose operation parameters - cublasOperation_t op_t1 = (transa == 'T' || transa == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t op_t2 = (transb == 'T' || transb == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; - - if (!alreadyAllocated_tcgemm_handle) { - cublasCreate(&handle_tcgemm); - alreadyAllocated_tcgemm_handle=true; - } - - //cublasSetMathMode(handle_tcgemm, CUBLAS_TENSOR_OP_MATH); - - if (!alreadyAllocated_tcgemm) { - // Allocate host arrays specifically for host->device transfer - cudaMallocHost(&Aarray_tcgemm, batchCount*sizeof(float*)); - cudaMallocHost(&Barray_tcgemm, batchCount*sizeof(float*)); - cudaMallocHost(&Carray_tcgemm, batchCount*sizeof(float*)); - alreadyAllocated_tcgemm=true; - } - - // Allocate device arrays - cudaMalloc(&d_Aarray_h, batchCount*sizeof(float*)); - cudaMalloc(&d_Barray_h, batchCount*sizeof(float*)); - cudaMalloc(&d_Aarray_tcgemm, batchCount*sizeof(float*)); - cudaMalloc(&d_Barray_tcgemm, batchCount*sizeof(float*)); - cudaMalloc(&d_Carray_tcgemm, batchCount*sizeof(float*)); - - // Copy data from dummy arrays to host arrays - for (int i = 0; i < batchCount; i++) { - Aarray_tcgemm[i] = (float*) &(A[i*lda*tda]); - Barray_tcgemm[i] = (float*) &(B[i*ldb*tdb]); - Carray_tcgemm[i] = (float*) &(C[i*ldc*tdc]); - } - - // Transfer arrays from host to device - cudaMemcpy(d_Aarray_tcgemm, Aarray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(d_Barray_tcgemm, Barray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(d_Carray_tcgemm, Carray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); - -// // Convert arrays to half-precision -// for (int i = 0; i < batchCount; i++) { -// float2half<<<(int)(m*k/256) + 1, 256 >>>(d_Aarray_h[i], d_Aarray_tcgemm[i], batchCount); -// float2half<<<(int)(k*n/256) + 1, 256 >>>(d_Barray_h[i], d_Barray_tcgemm[i], batchCount); -// } - - // Perform Tensor Core batched GEMM - cublasGemmBatchedEx(handle_tcgemm, op_t1, op_t2, - m, n, k, - (const void *)&alpha, - (const void **)d_Aarray_h, CUDA_R_32F, lda, - (const void **)d_Barray_h, CUDA_R_32F, ldb, - (const void *)&beta, - (void **)d_Carray_tcgemm, CUDA_R_32F, ldc, - batchCount, - CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); - - cudaDeviceSynchronize(); - - cudaFree(d_Aarray_h); - cudaFree(d_Barray_h); - cudaFree(d_Aarray_tcgemm); - cudaFree(d_Barray_tcgemm); - cudaFree(d_Carray_tcgemm); -} - -extern "C" void cublasTCgemmBatched_finalize() -{ - if (alreadyAllocated_tcgemm) { - cudaFree(Aarray_tcgemm); - cudaFree(Barray_tcgemm); - cudaFree(Carray_tcgemm); - - cudaFree(d_Aarray_h); - cudaFree(d_Barray_h); - cudaFree(d_Aarray_tcgemm); - cudaFree(d_Barray_tcgemm); - cudaFree(d_Carray_tcgemm); - } - - if (alreadyAllocated_tcgemm_handle) { - cublasDestroy(handle_tcgemm); - } -} - diff --git a/src/trans/gpu/algor/module/cublas_mod.F90 b/src/trans/gpu/algor/module/cublas_mod.F90 deleted file mode 100644 index e69a00e16..000000000 --- a/src/trans/gpu/algor/module/cublas_mod.F90 +++ /dev/null @@ -1,158 +0,0 @@ -MODULE CUBLAS_MOD -! -! Define the interfaces to the NVIDIA C code -! -interface cuda_gemm -! -! void cublasSgemm (char transa, char transb, int m, int n, -! int k, float alpha, const float *A, int lda, -! const float *B, int ldb, float beta, float *C, int ldc) -! -subroutine cuda_sgemm(cta, ctb, m, n, k,& -alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasSgemm') -use iso_c_binding -character(1,c_char),value :: cta, ctb -integer(c_int),value :: m,n,k,lda,ldb,ldc -real(c_float),value :: alpha,beta -real(c_float), dimension(lda,*) :: A -real(c_float), dimension(ldb,*) :: B -real(c_float), dimension(ldc,*) :: C -end subroutine cuda_sgemm - -! -! void cublasDgemm (char transa, char transb, int m, int n, -! int k, double alpha, const double *A, int lda, -! const double *B, int ldb, double beta, double *C, int ldc) -! -subroutine cuda_dgemm(cta, ctb, m, n, k,& -alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasDgemm') -use iso_c_binding -character(1,c_char),value :: cta, ctb -integer(c_int),value :: m,n,k,lda,ldb,ldc -real(c_double),value :: alpha,beta -real(c_double), dimension(lda,*) :: A -real(c_double), dimension(ldb,*) :: B -real(c_double), dimension(ldc,*) :: C -end subroutine cuda_dgemm -end interface - - -INTERFACE - SUBROUTINE CUDA_DGEMM_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublasDgemmBatched_wrapper') - USE ISO_C_BINDING - CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT - REAL(C_DOUBLE), VALUE :: ALPHA,BETA - REAL(C_DOUBLE), DIMENSION(LDA,*) :: A - REAL(C_DOUBLE), DIMENSION(LDB,*) :: B - REAL(C_DOUBLE), DIMENSION(LDC,*) :: C - END SUBROUTINE CUDA_DGEMM_BATCHED - - SUBROUTINE CUDA_DGEMM_STRIDED_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublasDgemmStridedBatched_wrapper') - USE ISO_C_BINDING - CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_LONG_LONG), VALUE :: TDA,TDB,TDC - REAL(C_DOUBLE), VALUE :: ALPHA, BETA - REAL(C_DOUBLE), DIMENSION(LDA,*) :: A - REAL(C_DOUBLE), DIMENSION(LDB,*) :: B - REAL(C_DOUBLE), DIMENSION(LDC,*) :: C - END SUBROUTINE CUDA_DGEMM_STRIDED_BATCHED - - subroutine cuda_dgemm_batched_finalize() bind(C,name='cublasDgemmBatched_finalize') - end subroutine cuda_dgemm_batched_finalize - -END INTERFACE - -INTERFACE - - SUBROUTINE CUDA_SGEMM_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublasSgemmBatched_wrapper') - USE ISO_C_BINDING - CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT - REAL(C_FLOAT), VALUE :: ALPHA, BETA - REAL(C_FLOAT), DIMENSION(LDA,*) :: A - REAL(C_FLOAT), DIMENSION(LDB,*) :: B - REAL(C_FLOAT), DIMENSION(LDC,*) :: C - END SUBROUTINE CUDA_SGEMM_BATCHED -!!END INTERFACE - -!!INTERFACE - SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublasSgemmStridedBatched_wrapper') - USE ISO_C_BINDING - CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_LONG_LONG), VALUE :: TDA,TDB,TDC - REAL(C_FLOAT), VALUE :: ALPHA, BETA - REAL(C_FLOAT), DIMENSION(LDA,*) :: A - REAL(C_FLOAT), DIMENSION(LDB,*) :: B - REAL(C_FLOAT), DIMENSION(LDC,*) :: C - END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED - - subroutine cuda_sgemm_batched_finalize() bind(C,name='cublasSgemmBatched_finalize') - end subroutine cuda_sgemm_batched_finalize - - -END INTERFACE - -INTERFACE - SUBROUTINE CUDA_STCGEMM_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublasSTCgemmBatched_wrapper') - USE ISO_C_BINDING - CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT - REAL(C_FLOAT), VALUE :: ALPHA, BETA - REAL(2), DIMENSION(LDA,*) :: A - REAL(2), DIMENSION(LDB,*) :: B - REAL(C_FLOAT), DIMENSION(LDC,*) :: C - END SUBROUTINE CUDA_STCGEMM_BATCHED -END INTERFACE - - - - -END MODULE CUBLAS_MOD diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index f61eed095..bf9f3adb5 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -1,5 +1,4 @@ MODULE CUDA_GEMM_BATCHED_MOD - USE CUBLAS_MOD USE PARKIND1, ONLY: JPRD, JPRM, JPIM IMPLICIT NONE @@ -13,6 +12,45 @@ MODULE CUDA_GEMM_BATCHED_MOD MODULE PROCEDURE CUDA_SGEMM_BATCHED_1D_3D_1D_OVERLOAD END INTERFACE CUDA_GEMM_BATCHED + INTERFACE + SUBROUTINE CUDA_SGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasSgemmBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_FLOAT), VALUE :: ALPHA, BETA + REAL(C_FLOAT), DIMENSION(LDA,*) :: A + REAL(C_FLOAT), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_SGEMM_BATCHED + SUBROUTINE CUDA_DGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasDgemmBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE), DIMENSION(LDA,*) :: A + REAL(C_DOUBLE), DIMENSION(LDB,*) :: B + REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_DGEMM_BATCHED + END INTERFACE + CONTAINS SUBROUTINE CUDA_DGEMM_BATCHED_1D_2D_1D_OVERLOAD( & From 930168ad622ee4a1556a5a15f476c30c46090b67 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:25 -0700 Subject: [PATCH 179/263] Remove cuda_device_mod (use cudafor instead) --- .../gpu/algor/module/cuda_device_mod.F90 | 40 ------------------- src/trans/gpu/external/setup_trans.F90 | 9 ++--- src/trans/gpu/internal/ftdir_mod.F90 | 1 - src/trans/gpu/internal/ftinv_mod.F90 | 2 - 4 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 src/trans/gpu/algor/module/cuda_device_mod.F90 diff --git a/src/trans/gpu/algor/module/cuda_device_mod.F90 b/src/trans/gpu/algor/module/cuda_device_mod.F90 deleted file mode 100644 index f710b687d..000000000 --- a/src/trans/gpu/algor/module/cuda_device_mod.F90 +++ /dev/null @@ -1,40 +0,0 @@ -module cuda_device_mod - -interface cuda_sync - -integer function cuda_synchronize() bind(C,name='cudaDeviceSynchronize') -use iso_c_binding -end function cuda_synchronize - -integer function cuda_stream_synchronize(stream) bind(C,name='cudaStreamSynchronize') -use iso_c_binding -type(c_ptr) :: stream -end function cuda_stream_synchronize - -integer function cuda_stream_destroy(stream) bind(C,name='cudaStreamDestroy') -use iso_c_binding -type(c_ptr) :: stream -end function cuda_stream_destroy - -end interface cuda_sync - -interface cuda_device - -integer function cuda_SetDevice(devnum) bind(C,name='cudaSetDevice') -use iso_c_binding -integer(c_int),value :: devnum -end function cuda_SetDevice - -integer function cuda_GetDevice(devnum) bind(C,name='cudaGetDevice') -use iso_c_binding -integer(c_int) :: devnum -end function cuda_GetDevice - -integer function cuda_GetDeviceCount(devnum) bind(C,name='cudaGetDeviceCount') -use iso_c_binding -integer(c_int) :: devnum -end function cuda_GetDeviceCount - -end interface cuda_device - -end module cuda_device_mod diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index f6e92c81b..f3a163729 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -130,11 +130,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE SHAREDMEM_MOD ,ONLY : SHAREDMEM_CREATE USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE CUDA_DEVICE_MOD USE PREPSNM_MOD ,ONLY : PREPSNM -#ifdef _OPENACC -use openacc -#endif +USE CUDAFOR +USE OPENACC !endif INTERFACE @@ -444,13 +442,12 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& iunit=300+myproc #ifdef _OPENACC -!!idevtype=acc_device_nvidia idevtype=acc_get_device_type() inumdevs = acc_get_num_devices(idevtype) mygpu = mod(MYPROC-1,inumdevs) CALL acc_set_device_num(mygpu, idevtype) mygpu = acc_get_device_num(idevtype) -istat = cuda_GetDevice(idev) +istat = cudaGetDevice(idev) WRITE(iunit,*) '===now going to allocate GPU arrays on processor: ', myproc, ' device = ', mygpu, ' ',idev, ' of ', inumdevs #endif diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index e1e995347..eae855cf4 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -50,7 +50,6 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G USE TPM_FFTC ,ONLY : EXECUTE_DIR_FFT -USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 0ee01bf63..ff3071f45 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -50,7 +50,6 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC USE TPM_GEOMETRY ,ONLY : G USE TPM_FFTC ,ONLY : EXECUTE_INV_FFT -USE CUDA_DEVICE_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -77,7 +76,6 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) -IRET = CUDA_SYNCHRONIZE() IF (LSYNC_TRANS) THEN CALL GSTATS(443,0) From bf6e6070f638fec110bfa3df6bd18778fcb8eeb3 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:25 -0700 Subject: [PATCH 180/263] Remove unused functions from GEMM wrapper --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 98 ------------------- 1 file changed, 98 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 1e38e2ff4..a4ad69738 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -93,61 +93,6 @@ extern "C" void cublasDgemmBatched_wrapper (char transa, char transb, int m, int } -extern "C" void cublasDgemmStridedBatched_wrapper (char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda, long long tda, const double *B, int ldb, long long tdb, double beta, double *C, int ldc, long long tdc, int batchCount) -{ - - - // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); - - - cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=CUBLAS_OP_T; - - if (transb=='T' || transb=='t') - op_t2=CUBLAS_OP_T; - - if (!alreadyAllocated_dgemm_handle){ - cublasCreate(&handle_dgemm); - alreadyAllocated_dgemm_handle=true; - } - cublasDgemmStridedBatched(handle_dgemm,op_t1,op_t2,m,n,k,&alpha,(const double *) A,lda,tda, (const double *) B,ldb,tdb,&beta,(double *) C,ldc,tdc,batchCount); - -} - -extern "C" void cublasDgemmBatched_finalize () -{ - - - - if (alreadyAllocated_dgemm){ - - cudaFree(Aarray); - cudaFree(Barray); - cudaFree(Carray); - - cudaFree(d_Aarray); - cudaFree(d_Barray); - cudaFree(d_Carray); - if (alreadyAllocated_dgemm_handle){ - cublasDestroy(handle_dgemm); - } - alreadyAllocated_dgemm_handle=false; - - } - alreadyAllocated_dgemm=false; - -} -// -// Wrapper for cublasSgemm function. -// -// Alan Gray, NVIDIA -// - -#include -#include "cublas_v2.h" - bool alreadyAllocated_sgemm=false; bool alreadyAllocated_sgemm_handle=false; @@ -221,46 +166,3 @@ extern "C" void cublasSgemmBatched_wrapper (char transa, char transb, int m, int } -extern "C" void cublasSgemmStridedBatched_wrapper (char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda, long long tda, const float *B, int ldb, long long tdb, float beta, float *C, int ldc, long long tdc, int batchCount) -{ - - - // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); - - - cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=CUBLAS_OP_T; - - if (transb=='T' || transb=='t') - op_t2=CUBLAS_OP_T; - - if (!alreadyAllocated_sgemm_handle){ - cublasCreate(&handle_sgemm); - alreadyAllocated_sgemm_handle=true; - } - cublasSgemmStridedBatched(handle_sgemm,op_t1,op_t2,m,n,k,&alpha,(const float *) A,lda,tda, (const float *) B,ldb,tdb,&beta,(float*) C,ldc,tdc,batchCount); - -} - -extern "C" void cublasSgemmBatched_finalize () -{ - - if (alreadyAllocated_sgemm){ - - cudaFree(Aarray_sgemm); - cudaFree(Barray_sgemm); - cudaFree(Carray_sgemm); - - cudaFree(d_Aarray_sgemm); - cudaFree(d_Barray_sgemm); - cudaFree(d_Carray_sgemm); - - } - - if (alreadyAllocated_sgemm_handle){ - cublasDestroy(handle_sgemm); - } - -} From e6506d014cbfe0f22edc96d0757dcba0e29ad79c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:25 -0700 Subject: [PATCH 181/263] Move culas*gemmBatched to cublas*gemmStridedBatched --- src/trans/gpu/algor/external/gemm/gemm_wrapper.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index a4ad69738..e3a4af11b 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -77,7 +77,7 @@ extern "C" void cublasDgemmBatched_wrapper (char transa, char transb, int m, int cudaDeviceSynchronize(); - cublasDgemmBatched(handle_dgemm,op_t1,op_t2,m,n,k,&alpha,(const double**) d_Aarray,lda, (const double**) d_Barray,ldb,&beta,(double**) d_Carray,ldc,batchCount); + cublasDgemmStridedBatched(handle_dgemm,op_t1,op_t2,m,n,k,&alpha,A,lda, lda*tda,B,ldb,ldb*tdb,&beta,C,ldc,ldc*tdc,batchCount); cudaDeviceSynchronize(); @@ -150,7 +150,7 @@ extern "C" void cublasSgemmBatched_wrapper (char transa, char transb, int m, int cudaMemcpy(d_Barray_sgemm,Barray_sgemm,batchCount*sizeof(float*),cudaMemcpyHostToDevice); cudaMemcpy(d_Carray_sgemm,Carray_sgemm,batchCount*sizeof(float*),cudaMemcpyHostToDevice); - cublasSgemmBatched(handle_sgemm,op_t1,op_t2,m,n,k,&alpha,(const float**) d_Aarray_sgemm,lda, (const float**) d_Barray_sgemm,ldb,&beta,(float**) d_Carray_sgemm,ldc,batchCount); + cublasSgemmStridedBatched(handle_sgemm,op_t1,op_t2,m,n,k,&alpha,A,lda, lda*tda,B,ldb,ldb*tdb,&beta,C,ldc,ldc*tdc,batchCount); //printf("after sgemm\n"); cudaDeviceSynchronize(); From 6089809b4c1492b6bac153392444e2f1ed6cc35d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:25 -0700 Subject: [PATCH 182/263] Cleanup GEMM interfaces --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 205 ++++-------------- .../gpu/internal/cuda_gemm_batched_mod.F90 | 97 +++------ src/trans/gpu/internal/ledir_mod.F90 | 34 +-- src/trans/gpu/internal/leinv_mod.F90 | 34 +-- 4 files changed, 106 insertions(+), 264 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index e3a4af11b..76db5e72f 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -1,168 +1,53 @@ -// -// Wrapper for cublasDgemm function. -// -// Alan Gray, NVIDIA -// - +#include "cublas_v2.h" #include -#include "cublas_v2.h" - - -bool alreadyAllocated_dgemm=false; -bool alreadyAllocated_dgemm_handle=false; - -double **d_Aarray; -double **d_Barray; -double **d_Carray; - -double **Aarray; -double **Barray; -double **Carray; - -cublasHandle_t handle_dgemm; - -extern "C" void cublasDgemmBatched_wrapper (char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda, int tda, const double *B, int ldb, int tdb, double beta, double *C, int ldc, int tdc, int batchCount) -{ - - - // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); - cublasStatus_t stat; - - - cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=CUBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=CUBLAS_OP_T; - - - //double **Aarray = (double**) malloc(batchCount*sizeof(double*)); - //double **Barray = (double**) malloc(batchCount*sizeof(double*)); - //double **Carray = (double**) malloc(batchCount*sizeof(double*)); - - - - if (!alreadyAllocated_dgemm_handle){ - stat = cublasCreate(&handle_dgemm); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - //return EXIT_FAILURE; - } +#define CUDA_CHECK(e) \ + { \ + cudaError_t err = (e); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error: %s, line %d, %s: %s\n", __FILE__, __LINE__, \ + #e, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ } - alreadyAllocated_dgemm_handle=true; - - if (!alreadyAllocated_dgemm){ - cudaError_t errcm1 = cudaMallocHost(&Aarray,batchCount*sizeof(double*)); - cudaError_t errcm2 = cudaMallocHost(&Barray,batchCount*sizeof(double*)); - cudaError_t errcm3 = cudaMallocHost(&Carray,batchCount*sizeof(double*)); - - cudaError_t errcm4 = cudaMalloc(&d_Aarray,batchCount*sizeof(double*)); - cudaError_t errcm5 = cudaMalloc(&d_Barray,batchCount*sizeof(double*)); - cudaError_t errcm6 = cudaMalloc(&d_Carray,batchCount*sizeof(double*)); - } - alreadyAllocated_dgemm=true; - - int i; - for(i=0;i GSTATS_NVTX USE, INTRINSIC :: ISO_C_BINDING @@ -150,13 +150,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T CALL CUDA_GEMM_BATCHED( & - & 'N', 'N', & + & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRBT, & - & ZINPA, 2*KF_FS, R_NDGNH, & - & ZAA, R_NDGNH, TDZAA, & + & ZINPA, 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAA, R_NDGNH, TDZAA*R_NDGNH, & & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAA, & + & ZOUT, 2*KF_FS, TDZAA*2*KF_FS, & & D_NUMP) IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -197,13 +197,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! C^T=B^T*A^T CALL CUDA_GEMM_BATCHED( & - & 'N', 'N', & + & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, TDZAA, R_NDGNH, & & 1.0_JPRD, & - & ZINP0, KF_FS, R_NDGNH, & - & ZAA0, R_NDGNH, TDZAA, & + & ZINP0, KF_FS, R_NDGNH*KF_FS, & + & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & & 0.0_JPRD, & - & ZOUT0, KF_FS, TDZAA, & + & ZOUT0, KF_FS, TDZAA*KF_FS, & & 1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) @@ -227,13 +227,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T CALL CUDA_GEMM_BATCHED( & - & 'N', 'N', & + & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRBT, & - & ZINPS, 2*KF_FS, R_NDGNH, & - & ZAS, R_NDGNH, TDZAS, & + & ZINPS, 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAS, R_NDGNH, TDZAS*R_NDGNH, & & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAS, & + & ZOUT, 2*KF_FS, TDZAS*2*KF_FS, & & D_NUMP) IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -270,13 +270,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! C^T=B^T*A^T call CUDA_GEMM_BATCHED( & - & 'N', 'N', & + & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, TDZAS, R_NDGNH, & & 1.0_JPRD, & - & ZINP0, KF_FS, R_NDGNH, & - & ZAS0, R_NDGNH, TDZAS, & + & ZINP0, KF_FS, R_NDGNH*KF_FS, & + & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & & 0.0_JPRD, & - & ZOUT0, KF_FS, TDZAS, & + & ZOUT0, KF_FS, TDZAS*KF_FS, & & 1) !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 71da45119..2ed100ee4 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -58,7 +58,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT -USE CUDA_GEMM_BATCHED_MOD, ONLY: CUDA_GEMM_BATCHED +USE CUDA_GEMM_BATCHED_MOD USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC @@ -157,13 +157,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(424,0) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION CALL CUDA_GEMM_BATCHED( & - & 'N', 'T', & + & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, R_NDGNH, TDZAA, & & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAA,& - & ZAA, R_NDGNH, TDZAA, & + & ZINP, KFIELDS, TDZAA*KFIELDS, & + & ZAA, R_NDGNH, TDZAA*R_NDGNH, & & 0._JPRBT, & - & ZOUTA, KFIELDS, R_NDGNH, & + & ZOUTA, KFIELDS, R_NDGNH*KFIELDS, & & D_NUMP) IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) @@ -183,13 +183,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO CALL CUDA_GEMM_BATCHED( & - & 'N', 'T', & + & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, R_NDGNH, TDZAA, & & 1.0_JPRD, & - & ZINP0, KFIELDS/2, TDZAA, & - & ZAA0, R_NDGNH, TDZAA, & + & ZINP0, KFIELDS/2, TDZAA*KFIELDS/2, & + & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & & 0.0_JPRD, & - & ZOUT0, KFIELDS/2, R_NDGNH, & + & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & & 1) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) @@ -242,13 +242,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,0) CALL CUDA_GEMM_BATCHED( & - & 'N', 'T', & + & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, R_NDGNH, TDZAS, & & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAS, & - & ZAS, R_NDGNH, TDZAS, & + & ZINP, KFIELDS, TDZAS*KFIELDS, & + & ZAS, R_NDGNH, TDZAS*R_NDGNH, & & 0._JPRBT, & - & ZOUTS, KFIELDS, R_NDGNH, & + & ZOUTS, KFIELDS, R_NDGNH*KFIELDS, & & D_NUMP) IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) @@ -267,13 +267,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO CALL CUDA_GEMM_BATCHED( & - & 'N', 'T', & + & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, R_NDGNH, TDZAS, & & 1.0_JPRD, & - & ZINP0, KFIELDS/2, TDZAS, & - & ZAS0, R_NDGNH, TDZAS, & + & ZINP0, KFIELDS/2, TDZAS*KFIELDS/2, & + & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & & 0.0_JPRD, & - & ZOUT0, KFIELDS/2, R_NDGNH, & + & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & & 1) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) From 2d1b671787104f508384abc780076b4041c2714f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:26 -0700 Subject: [PATCH 183/263] VDTUV parallelized properly --- src/trans/gpu/internal/uvtvd_mod.F90 | 1 + src/trans/gpu/internal/vdtuv_mod.F90 | 46 ++++++++++++++-------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 3833fef28..7527aee70 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -129,6 +129,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) PDIV(II,IN,kmloc) = +ZKM*PU(IR,IN,kmloc)+& &ZJN*ZEPSNM(KMLOC,JN+1)*PV(II,IN-1,kmloc)-& &(ZJN+1)*ZEPSNM(KMLOC,JN)*PV(II,IN+1,kmloc) + ELSEIF(KM == 0) THEN ! (DO JN=0,R_NTMAX) IN = R_NTMAX+3-JN diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 7e5b68b47..96387f850 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -98,30 +98,19 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. ! ------------------------------------------ -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,KM,ZKM,JI) DEFAULT(NONE) DO KMLOC=1,D%NUMP - DO J=1,KFIELD - IR = 2*J-1 - II = IR+1 - KM = D_MYMS(KMLOC) - ZKM = REAL(KM,JPRBT) - - IF(KM == 0) THEN - !$ACC LOOP SEQ - DO JN=0,R_NTMAX+1 + DO JN=0,R_NTMAX+1 + DO J=1,KFIELD + IR = 2*J-1 + II = IR+1 + KM = D_MYMS(KMLOC) + ZKM = REAL(KM,JPRBT) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX) JI = R_NTMAX+3-JN - PU(IR,JI,KMLOC) = +& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) - PV(IR,JI,KMLOC) = -& - &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& - &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) - ENDDO - ELSE - !$ACC LOOP SEQ - DO JN=KM,R_NTMAX+1 - JI = R_NTMAX+3-JN PU(ir,JI,kmloc) = -ZKM*F%RLAPIN(JN)*PDIV(ii,JI,kmloc)+& &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(ir,JI+1,kmloc)-& &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(ir,JI-1,kmloc) @@ -134,8 +123,19 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) PV(ii,JI,kmloc) = +ZKM*F%RLAPIN(JN)*PVOR(ir,JI,kmloc)-& &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(ii,JI+1,kmloc)+& &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(ii,JI-1,kmloc) - ENDDO - ENDIF + + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX) + JI = R_NTMAX+3-JN + + PU(IR,JI,KMLOC) = +& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PVOR(IR,JI+1,KMLOC)-& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PVOR(IR,JI-1,KMLOC) + PV(IR,JI,KMLOC) = -& + &(JN-1)*PEPSNM(KMLOC,JN)*F%RLAPIN(JN-1)*PDIV(IR,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*F%RLAPIN(JN+1)*PDIV(IR,JI-1,KMLOC) + ENDIF + ENDDO ENDDO ENDDO From b05f2b7e8db31ef2430b4b56fc2a43715b90b28f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:26 -0700 Subject: [PATCH 184/263] Parallelize prfi1b properly --- src/trans/gpu/internal/prfi1b_mod.F90 | 39 ++++++++++++--------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index fdd141a99..496ac3d57 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -87,7 +87,8 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) IF(PRESENT(KFLDPTR)) THEN - + PRINT *, "Not implemented" + STOP 4 !loop over wavenumber @@ -128,34 +129,28 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !loop over wavenumber - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IASM0,INM,JN) DEFAULT(NONE) + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(KM,IASM0,INM) DEFAULT(NONE) DO KMLOC=1,D_NUMP - DO JFLD=1,KFIELDS - KM = D_MYMS(KMLOC) - IASM0 = D_NASM0(KM) + DO JN=0,R_NSMAX+3 + DO JFLD=1,KFIELDS + KM = D_MYMS(KMLOC) - !$ACC LOOP SEQ - DO JN=2,R_NSMAX+2-KM - INM = IASM0+((R_NSMAX+2-JN)-KM)*2 - IF( INM .LT. KDIM ) THEN ! TODO is this really needed, we don't have it in the reverse... + IF (JN <= 1) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ELSEIF (JN <= R_NSMAX+2-KM) THEN + IASM0 = D_NASM0(KM) + INM = IASM0+((R_NSMAX+2-JN)-KM)*2 PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) - END IF + ELSEIF (JN <= R_NSMAX+3-KM) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ENDIF ENDDO ENDDO - END DO - - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,JN) - DO KMLOC=1,D_NUMP - DO JFLD=1,2*KFIELDS - PIA(JFLD,1,KMLOC) = 0.0_JPRB - PIA(JFLD,2,KMLOC) = 0.0_JPRB + ENDDO - KM = D_MYMS(KMLOC) - JN = R_NSMAX+3-KM - PIA(JFLD,JN+1,KMLOC) = 0.0_JPRB - ENDDO - END DO END IF !$ACC END DATA From dfcc9322b1bdbc857e769988c4fe198573ba1459 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:26 -0700 Subject: [PATCH 185/263] Paralllelize spnsde properly --- src/trans/gpu/internal/spnsde_mod.F90 | 32 +++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index b8e0d02d0..6affe2154 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -96,31 +96,29 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) !* 1.1 COMPUTE -!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(IR) +!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IR,II,KM,JI) DEFAULT(NONE) DO KMLOC=1,D%NUMP - DO J=1,KF_SCALARS - KM = D%MYMS(KMLOC) - IR = 2*J-1 - II = IR+1 - - IF(KM == 0) THEN - !$ACC LOOP SEQ - DO JN=0,R_NTMAX+1 + DO JN=0,R_NTMAX+1 + DO J=1,KF_SCALARS + IR = 2*J-1 + II = IR+1 + KM = D%MYMS(KMLOC) + + IF(KM /= 0 .AND. JN >= KM) THEN + ! (DO JN=KN,R_NTMAX+1) JI = R_NTMAX+3-JN PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) - ENDDO + PNSD(II,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(II,JI+1,KMLOC)+& + &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(II,JI-1,KMLOC) - ELSE - !$ACC LOOP SEQ - DO JN=KM,R_NTMAX+1 + ELSEIF(KM == 0) THEN + ! (DO JN=0,R_NTMAX+1) JI = R_NTMAX+3-JN PNSD(IR,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(IR,JI+1,KMLOC)+& &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(IR,JI-1,KMLOC) - PNSD(II,JI,KMLOC) = -(JN-1)*PEPSNM(KMLOC,JN)*PF(II,JI+1,KMLOC)+& - &(JN+2)*PEPSNM(KMLOC,JN+1)*PF(II,JI-1,KMLOC) - ENDDO - ENDIF + ENDIF + ENDDO ENDDO END DO From 41fc8fede6bcc8c31ae872f2a531f71a3d36b3b1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:26 -0700 Subject: [PATCH 186/263] Use single GEMM calls, slow because lots of syncs (CHANGE1: 9) (CHANGE2: 11) --- src/trans/gpu/internal/ledir_mod.F90 | 40 +++++++++++++++------------- src/trans/gpu/internal/leinv_mod.F90 | 40 +++++++++++++++------------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index ec99297a5..4a541d5ce 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -149,15 +149,17 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, TDZAA, R_NDGNH, & - & 1.0_JPRBT, & - & ZINPA, 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAA, R_NDGNH, TDZAA*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAA*2*KF_FS, & - & D_NUMP) +DO KMLOC=1,D_NUMP + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, TDZAA, R_NDGNH, & + & 1.0_JPRBT, & + & ZINPA((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT((KMLOC-1)*TDZAA*2*KF_FS+1:), 2*KF_FS, TDZAA*2*KF_FS, & + & 1) +ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') @@ -226,15 +228,17 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T -CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, TDZAS, R_NDGNH, & - & 1.0_JPRBT, & - & ZINPS, 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAS, R_NDGNH, TDZAS*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAS*2*KF_FS, & - & D_NUMP) +DO KMLOC=1,D_NUMP + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, TDZAS, R_NDGNH, & + & 1.0_JPRBT, & + & ZINPS((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT((KMLOC-1)*TDZAS*2*KF_FS+1:), 2*KF_FS, TDZAS*2*KF_FS, & + & 1) +ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 2ed100ee4..de38f7398 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -156,15 +156,17 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,0) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION -CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, R_NDGNH, TDZAA, & - & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAA*KFIELDS, & - & ZAA, R_NDGNH, TDZAA*R_NDGNH, & - & 0._JPRBT, & - & ZOUTA, KFIELDS, R_NDGNH*KFIELDS, & - & D_NUMP) +DO KMLOC=1,D_NUMP + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, R_NDGNH, TDZAA, & + & 1.0_JPRBT, & + & ZINP((KMLOC-1)*TDZAA*KFIELDS+1:), KFIELDS, TDZAA*KFIELDS, & + & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTA((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & + & 1) +ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -241,15 +243,17 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) -CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, R_NDGNH, TDZAS, & - & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAS*KFIELDS, & - & ZAS, R_NDGNH, TDZAS*R_NDGNH, & - & 0._JPRBT, & - & ZOUTS, KFIELDS, R_NDGNH*KFIELDS, & - & D_NUMP) +DO KMLOC=1,D_NUMP + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, R_NDGNH, TDZAS, & + & 1.0_JPRBT, & + & ZINP((KMLOC-1)*TDZAS*KFIELDS+1:), KFIELDS, TDZAS*KFIELDS, & + & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTS((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & + & 1) +ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') From b44baa6eae77b74e1f10e8f9abdf60fd476a8071 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:27 -0700 Subject: [PATCH 187/263] Multiple GEMM calls (CHANGE1: 10) (CHANGE2: 12) --- src/trans/gpu/internal/ledir_mod.F90 | 46 ++++++++++++++++------------ src/trans/gpu/internal/leinv_mod.F90 | 46 ++++++++++++++++------------ 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 4a541d5ce..60fedeed6 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -150,15 +150,18 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T DO KMLOC=1,D_NUMP - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, TDZAA, R_NDGNH, & - & 1.0_JPRBT, & - & ZINPA((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT((KMLOC-1)*TDZAA*2*KF_FS+1:), 2*KF_FS, TDZAA*2*KF_FS, & - & 1) + KM = D_MYMS(KMLOC) + IF (KM /= 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, (R%NSMAX-KM+2)/2, G%NDGLU(KM), & + & 1.0_JPRBT, & + & ZINPA((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT((KMLOC-1)*TDZAA*2*KF_FS+1:), 2*KF_FS, TDZAA*2*KF_FS, & + & 1) + ENDIF ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -200,7 +203,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, TDZAA, R_NDGNH, & + & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & & 1.0_JPRD, & & ZINP0, KF_FS, R_NDGNH*KF_FS, & & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & @@ -229,15 +232,18 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !C=A*B => ! C^T=B^T*A^T DO KMLOC=1,D_NUMP - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, TDZAS, R_NDGNH, & - & 1.0_JPRBT, & - & ZINPS((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT((KMLOC-1)*TDZAS*2*KF_FS+1:), 2*KF_FS, TDZAS*2*KF_FS, & - & 1) + KM = D_MYMS(KMLOC) + IF (KM /= 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, (R%NSMAX-KM+3)/2, G%NDGLU(KM), & + & 1.0_JPRBT, & + & ZINPS((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT((KMLOC-1)*TDZAS*2*KF_FS+1:), 2*KF_FS, TDZAS*2*KF_FS, & + & 1) + ENDIF ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -275,7 +281,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) call CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, TDZAS, R_NDGNH, & + & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & & 1.0_JPRD, & & ZINP0, KF_FS, R_NDGNH*KF_FS, & & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index de38f7398..df7ad9e6b 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -157,15 +157,18 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(424,0) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION DO KMLOC=1,D_NUMP - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, R_NDGNH, TDZAA, & - & 1.0_JPRBT, & - & ZINP((KMLOC-1)*TDZAA*KFIELDS+1:), KFIELDS, TDZAA*KFIELDS, & - & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUTA((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & - & 1) + KM = D_MYMS(KMLOC) + IF (KM /= 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, G%NDGLU(KM), (R%NSMAX-KM+2)/2, & + & 1.0_JPRBT, & + & ZINP((KMLOC-1)*TDZAA*KFIELDS+1:), KFIELDS, TDZAA*KFIELDS, & + & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTA((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & + & 1) + ENDIF ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) @@ -186,7 +189,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, R_NDGNH, TDZAA, & + & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & & 1.0_JPRD, & & ZINP0, KFIELDS/2, TDZAA*KFIELDS/2, & & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & @@ -244,15 +247,18 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,0) DO KMLOC=1,D_NUMP - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, R_NDGNH, TDZAS, & - & 1.0_JPRBT, & - & ZINP((KMLOC-1)*TDZAS*KFIELDS+1:), KFIELDS, TDZAS*KFIELDS, & - & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUTS((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & - & 1) + KM = D_MYMS(KMLOC) + IF (KM /= 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, G%NDGLU(KM), (R%NSMAX-KM+3)/2, & + & 1.0_JPRBT, & + & ZINP((KMLOC-1)*TDZAS*KFIELDS+1:), KFIELDS, TDZAS*KFIELDS, & + & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTS((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & + & 1) + ENDIF ENDDO IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) @@ -272,7 +278,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, R_NDGNH, TDZAS, & + & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & & 1.0_JPRD, & & ZINP0, KFIELDS/2, TDZAS*KFIELDS/2, & & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & From 96889cc3895fa3d36e931c69fc8abc091bbb956e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:27 -0700 Subject: [PATCH 188/263] Do not synchronize after each GEMM --- src/trans/gpu/algor/external/gemm/gemm_wrapper.cu | 2 -- src/trans/gpu/internal/ledir_mod.F90 | 4 ++++ src/trans/gpu/internal/leinv_mod.F90 | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 76db5e72f..41e6b3a3d 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -33,7 +33,6 @@ void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); - CUDA_CHECK(cudaDeviceSynchronize()); } void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, @@ -48,6 +47,5 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); - CUDA_CHECK(cudaDeviceSynchronize()); } } diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 60fedeed6..6373ac018 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -163,6 +163,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 1) ENDIF ENDDO +CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') @@ -210,6 +211,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRD, & & ZOUT0, KF_FS, TDZAA*KF_FS, & & 1) + CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) DO J=1,(R_NSMAX+2)/2 @@ -245,6 +247,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 1) ENDIF ENDDO +CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') @@ -288,6 +291,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 0.0_JPRD, & & ZOUT0, KF_FS, TDZAS*KF_FS, & & 1) + call cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) DO J=1,(R_NSMAX+3)/2 diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index df7ad9e6b..103354766 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -170,6 +170,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 1) ENDIF ENDDO +CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -196,6 +197,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0.0_JPRD, & & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & & 1) + CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) @@ -260,6 +262,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 1) ENDIF ENDDO +CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -285,6 +288,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 0.0_JPRD, & & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & & 1) + CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) From ba65812dab1232d54f5bad2203ff34d0617613fa Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:27 -0700 Subject: [PATCH 189/263] Add grouped GEMM in LEINV --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 33 ++++++ .../gpu/internal/cuda_gemm_batched_mod.F90 | 112 ++++++++++++++++++ src/trans/gpu/internal/leinv_mod.F90 | 48 ++++---- 3 files changed, 171 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 41e6b3a3d..58bc3bed8 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -48,4 +48,37 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); } + +void cublas_sgemm_wrapper_grouped(cublasOperation_t transa, + cublasOperation_t transb, int m, int *n, + int *k, float alpha, const float *A, int lda, + int tda, const float *B, int ldb, int tdb, + float beta, float *C, int ldc, int tdc, + int batchCount) { + static cublasHandle_t handle = nullptr; + if (!handle) + CUBLAS_CHECK(cublasCreate(&handle)); + + for (int i = 0; i < batchCount; ++i) { + CUBLAS_CHECK(cublasSgemm(handle, transa, transb, m, n[i], k[i], &alpha, + A + i * tda, lda, B + i * tdb, ldb, &beta, + C + i * tdc, ldc)); + } +} +void cublas_dgemm_wrapper_grouped(cublasOperation_t transa, + cublasOperation_t transb, int m, int *n, + int *k, double alpha, const double *A, + int lda, int tda, const double *B, int ldb, + int tdb, double beta, double *C, int ldc, + int tdc, int batchCount) { + static cublasHandle_t handle = nullptr; + if (!handle) + CUBLAS_CHECK(cublasCreate(&handle)); + + for (int i = 0; i < batchCount; ++i) { + CUBLAS_CHECK(cublasDgemm(handle, transa, transb, m, n[i], k[i], &alpha, + A + i * tda, lda, B + i * tdb, ldb, &beta, + C + i * tdc, ldc)); + } +} } diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index d499f02df..24eecda9b 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -12,6 +12,8 @@ MODULE CUDA_GEMM_BATCHED_MOD INTERFACE CUDA_GEMM_BATCHED MODULE PROCEDURE CUDA_DGEMM_BATCHED_OVERLOAD MODULE PROCEDURE CUDA_SGEMM_BATCHED_OVERLOAD + MODULE PROCEDURE CUDA_DGEMM_GROUPED_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_GROUPED_OVERLOAD END INTERFACE CUDA_GEMM_BATCHED INTERFACE @@ -30,6 +32,36 @@ SUBROUTINE CUDA_SGEMM_BATCHED(& REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) END SUBROUTINE CUDA_SGEMM_BATCHED + SUBROUTINE CUDA_DGEMM_GROUPED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublas_dgemm_wrapper_grouped') + USE ISO_C_BINDING + INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE) :: A(*), B(*), C(*) + END SUBROUTINE CUDA_DGEMM_GROUPED + SUBROUTINE CUDA_SGEMM_GROUPED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublas_sgemm_wrapper_grouped') + USE ISO_C_BINDING + INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_FLOAT), VALUE :: ALPHA,BETA + REAL(C_FLOAT) :: A(*), B(*), C(*) + END SUBROUTINE CUDA_SGEMM_GROUPED SUBROUTINE CUDA_DGEMM_BATCHED(& & CTA, CTB, & & M, N, K, & @@ -129,4 +161,84 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & !$ACC END HOST_DATA END SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD + SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA + INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N(:) + INTEGER(KIND=JPIM) :: K(:) + REAL(KIND=JPRD) :: ALPHA + REAL(KIND=JPRD), DIMENSION(*) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRD), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRD) :: BETA + REAL(KIND=JPRD), DIMENSION(*) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) + CALL CUDA_DGEMM_GROUPED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + !$ACC END HOST_DATA + END SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD + + SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA + INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N(:) + INTEGER(KIND=JPIM) :: K(:) + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(*) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRM), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(*) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) + CALL CUDA_SGEMM_GROUPED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + !$ACC END HOST_DATA + END SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD + END MODULE CUDA_GEMM_BATCHED_MOD diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 103354766..7a45ce8f3 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -71,6 +71,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: KIFC +INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) REAL(KIND=JPRB), INTENT(OUT), ALLOCATABLE :: FOUBUF_IN(:) @@ -158,19 +159,21 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! OVERLOADED FOR SINGLE AND DOUBLE PRECISION DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, G%NDGLU(KM), (R%NSMAX-KM+2)/2, & - & 1.0_JPRBT, & - & ZINP((KMLOC-1)*TDZAA*KFIELDS+1:), KFIELDS, TDZAA*KFIELDS, & - & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUTA((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & - & 1) - ENDIF + KS(KMLOC) = (R%NSMAX-KM+2)/2 + NS(KMLOC) = G%NDGLU(KM) ENDDO +! TODO SKIP KMLOC0 +CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, KFIELDS, TDZAA*KFIELDS, & + & ZAA, R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTA, KFIELDS, R_NDGNH*KFIELDS, & + & D_NUMP) CALL cudaDeviceSynchronize() + IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') @@ -250,18 +253,19 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(424,0) DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, G%NDGLU(KM), (R%NSMAX-KM+3)/2, & - & 1.0_JPRBT, & - & ZINP((KMLOC-1)*TDZAS*KFIELDS+1:), KFIELDS, TDZAS*KFIELDS, & - & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUTS((KMLOC-1)*R_NDGNH*KFIELDS+1:), KFIELDS, R_NDGNH*KFIELDS, & - & 1) - ENDIF + KS(KMLOC) = (R%NSMAX-KM+3)/2 + NS(KMLOC) = G%NDGLU(KM) ENDDO +! TODO SKIP KMLOC0 +CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, KFIELDS, TDZAS*KFIELDS, & + & ZAS, R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUTS, KFIELDS, R_NDGNH*KFIELDS, & + & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) From 31dc1d08c47c8c368bda4f653dcfc69a00259196 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:27 -0700 Subject: [PATCH 190/263] Add grouped GEMM in LEDIR --- src/trans/gpu/internal/ledir_mod.F90 | 47 +++++++++++++++------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 6373ac018..1d3a33a8c 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -79,6 +79,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK +INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS @@ -151,18 +152,19 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! C^T=B^T*A^T DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, (R%NSMAX-KM+2)/2, G%NDGLU(KM), & - & 1.0_JPRBT, & - & ZINPA((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAA(:,:,KMLOC), R_NDGNH, TDZAA*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT((KMLOC-1)*TDZAA*2*KF_FS+1:), 2*KF_FS, TDZAA*2*KF_FS, & - & 1) - ENDIF + NS(KMLOC) = (R%NSMAX-KM+2)/2 + KS(KMLOC) = G%NDGLU(KM) ENDDO +! TODO SKIP KMLOC0 +CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPA, 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAA, R_NDGNH, TDZAA*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT, 2*KF_FS, TDZAA*2*KF_FS, & + & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -235,18 +237,19 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! C^T=B^T*A^T DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, (R%NSMAX-KM+3)/2, G%NDGLU(KM), & - & 1.0_JPRBT, & - & ZINPS((KMLOC-1)*R_NDGNH*2*KF_FS+1:), 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAS(:,:,KMLOC), R_NDGNH, TDZAS*R_NDGNH, & - & 0.0_JPRBT, & - & ZOUT((KMLOC-1)*TDZAS*2*KF_FS+1:), 2*KF_FS, TDZAS*2*KF_FS, & - & 1) - ENDIF + NS(KMLOC) = (R%NSMAX-KM+3)/2 + KS(KMLOC) = G%NDGLU(KM) ENDDO +! TODO SKIP KMLOC0 +CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPS, 2*KF_FS, R_NDGNH*2*KF_FS, & + & ZAS, R_NDGNH, TDZAS*R_NDGNH, & + & 0.0_JPRBT, & + & ZOUT, 2*KF_FS, TDZAS*2*KF_FS, & + & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) From 73629963f47cf906d1ce3ae43a969af3902d59ab Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:27 -0700 Subject: [PATCH 191/263] remove TDZAA/TDZAS and add "strides" variables to leinv/dir --- src/trans/gpu/external/setup_trans.F90 | 39 ++++--------- src/trans/gpu/internal/ledir_mod.F90 | 70 +++++++++++++---------- src/trans/gpu/internal/leinv_mod.F90 | 77 +++++++++++++++----------- src/trans/gpu/internal/tpm_fields.F90 | 3 - 4 files changed, 97 insertions(+), 92 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index f3a163729..350bc252b 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -109,7 +109,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& & D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,mysetv,mysetw, MYPROC USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZEPSNM, & -& ZAA,ZAS,TDZAA,TDZAS,& +& ZAA,ZAS,& & ZAA0,& & ZAS0,KMLOC0 USE TPM_FFT ,ONLY : T, FFT_RESOL @@ -166,7 +166,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !ifndef INTERFACE ! Local variables -INTEGER(KIND=JPIM),PARAMETER :: IMAXFLD=240 INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, i, J @@ -451,10 +450,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& WRITE(iunit,*) '===now going to allocate GPU arrays on processor: ', myproc, ' device = ', mygpu, ' ',idev, ' of ', inumdevs #endif -!leading and trailing dimensions of A for symmetric and antisymmetric cases -! (same for ltinv and ltdir) -TDZAA=(R%NTMAX+2)/2 -TDZAS=(R%NTMAX+3)/2 print*,'R%NTMAX=',R%NTMAX print*,'R%NSMAX=',R%NSMAX @@ -464,37 +459,23 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays -ALLOCATE(ZAA(R%NDGNH,TDZAA,D%NUMP)) -ALLOCATE(ZAS(R%NDGNH,TDZAS,D%NUMP)) +ALLOCATE(ZAA(R%NDGNH,(R%NTMAX+2)/2,D%NUMP)) +ALLOCATE(ZAS(R%NDGNH,(R%NTMAX+3)/2,D%NUMP)) write(nout,*)'setup_trans: sizes1 NUMP=',D%NUMP write(nout,*)'ZAS:',size(ZAS) write(nout,*)'ZAA:',size(ZAA) ZAA(:,:,:) = 0 -DO JMLOC=1,D%NUMP - KM = D%MYMS(JMLOC) - KDGLU = MIN(R%NDGNH,G%NDGLU(KM)) - - ILA = (R%NSMAX-KM+2)/2 - DO JK=1,KDGLU - DO J=1,ILA - ZAA(JK,J,JMLOC)=S%FA(JMLOC)%RPNMA(JK,J) - ENDDO - ENDDO -ENDDO - ZAS(:,:,:) = 0 DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) - KDGLU = MIN(R%NDGNH,G%NDGLU(KM)) - + KDGLU = G%NDGLU(KM) + ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - DO JK=1,KDGLU - DO J=1,ILS - ZAS(JK,J,JMLOC)=S%FA(JMLOC)%RPNMS(JK,J) - ENDDO - ENDDO + + ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) ENDDO !$ACC ENTER DATA COPYIN(ZAA,ZAS) @@ -615,8 +596,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! arrays for m=0 in ledir_mod: IF(KMLOC0 >= 0) THEN - ALLOCATE(ZAA0(R%NDGNH,TDZAA)) - ALLOCATE(ZAS0(R%NDGNH,TDZAS)) + ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) + ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) ZAA0 = ZAA(:,:,KMLOC0) ZAS0 = ZAS(:,:,KMLOC0) !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 1d3a33a8c..0efd65991 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -58,7 +58,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 +USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -88,13 +88,27 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) REAL(KIND=JPRBT), ALLOCATABLE :: ZINPS(:), ZINPA(:), ZOUT(:) REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) +INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 +INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 +INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 +INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -ALLOCATE(ZINPA(2*KF_FS*R_NDGNH*D_NUMP)) -ALLOCATE(ZINPS(2*KF_FS*R_NDGNH*D_NUMP)) -ALLOCATE(ZOUT(2*KF_FS*TDZAS*D_NUMP)) -ALLOCATE(ZINP0(KF_FS*R_NDGNH)) -ALLOCATE(ZOUT0(KF_FS*TDZAS)) +IOUT_STRIDES0 = 2*KF_FS +IOUT_STRIDES1 = IOUT_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) +IIN_STRIDES0 = 2*KF_FS +IIN_STRIDES1 = IIN_STRIDES0 * R_NDGNH +IOUT0_STRIDES0 = KF_FS +IOUT0_STRIDES1 = IOUT0_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) +IIN0_STRIDES0 = KF_FS +IIN0_STRIDES1 = IIN0_STRIDES0 * R_NDGNH + +ALLOCATE(ZINPA(IIN_STRIDES1*D_NUMP)) +ALLOCATE(ZINPS(IIN_STRIDES1*D_NUMP)) +ALLOCATE(ZOUT(IOUT_STRIDES1*D_NUMP)) +ALLOCATE(ZINP0(IIN0_STRIDES1*1)) +ALLOCATE(ZOUT0(IOUT0_STRIDES1*1)) !$ACC DATA & !$ACC& CREATE(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & @@ -128,8 +142,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) PAIA = PAIA*F%RACTHE(JGL) PAIS = PAIS*F%RACTHE(JGL) ENDIF - ZINPA(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL+(KMLOC-1)*R_NDGNH)*2*KF_FS)=PAIS*F%RW(JGL) + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_striDes0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) ENDIF ENDDO ENDDO @@ -160,10 +174,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINPA, 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAA, R_NDGNH, TDZAA*R_NDGNH, & + & ZINPA, IIN_STRIDES0, IIN_STRIDES1, & + & ZAA, SIZE(ZAA,1), SIZE(ZAA,1)*SIZE(ZAA,2), & & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAA*2*KF_FS, & + & ZOUT, IOUT_STRIDES0, IOUT_STRIDES1, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -181,7 +195,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IA = 1+MOD(R_NTMAX-KM+2,2) !$ACC LOOP SEQ DO J=1,(R%NSMAX-KM+2)/2 - POA1(JK,IA+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAA)*2*KF_FS) + POA1(JK,IA+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ENDDO ENDIF ENDDO @@ -194,8 +208,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*KF_FS) & - & = ZINPA((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + ZINP0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & + & = ZINPA((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) ENDDO ENDDO @@ -208,10 +222,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, KF_FS, R_NDGNH*KF_FS, & - & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & + & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & + & ZAA0, SIZE(ZAA0,1), SIZE(ZAA0,1)*SIZE(ZAA0,2), & & 0.0_JPRD, & - & ZOUT0, KF_FS, TDZAA*KF_FS, & + & ZOUT0, IOUT0_STRIDES0, IOUT_STRIDES1, & & 1) CALL cudaDeviceSynchronize() @@ -219,7 +233,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) DO J=1,(R_NSMAX+2)/2 DO JK=1,2*KF_FS,2 IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*KF_FS) + POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*IOUT0_STRIDES0) ENDDO ENDDO ENDIF @@ -245,10 +259,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINPS, 2*KF_FS, R_NDGNH*2*KF_FS, & - & ZAS, R_NDGNH, TDZAS*R_NDGNH, & + & ZINPS, IIN_STRIDES0, IIN_STRIDES1, & + & ZAS, SIZE(ZAS,1), SIZE(ZAS,1)*SIZE(ZAS,2), & & 0.0_JPRBT, & - & ZOUT, 2*KF_FS, TDZAS*2*KF_FS, & + & ZOUT, IOUT_STRIDES0, IOUT_STRIDES1, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -266,7 +280,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IS = 1+MOD(R_NTMAX-KM+1,2) !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - POA1(JK,IS+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1+(KMLOC-1)*TDZAS)*2*KF_FS) + POA1(JK,IS+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ENDDO ENDIF ENDDO @@ -276,8 +290,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*KF_FS) & - & = ZINPS((JF-1)+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*2*KF_FS) + ZINP0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & + & = ZINPS((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) ENDDO ENDDO @@ -289,10 +303,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, KF_FS, R_NDGNH*KF_FS, & - & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & + & ZINP0, IIN0_STRIDES0, IIN_STRIDES1, & + & ZAS0, SIZE(ZAS0,1), SIZE(ZAS0,1)*SIZE(ZAS0,2), & & 0.0_JPRD, & - & ZOUT0, KF_FS, TDZAS*KF_FS, & + & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & & 1) call cudaDeviceSynchronize() @@ -300,7 +314,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) DO J=1,(R_NSMAX+3)/2 DO JK=1,2*KF_FS,2 IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*KF_FS) + POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*IOUT0_STRIDES0) ENDDO ENDDO diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 7a45ce8f3..15d60293f 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -55,7 +55,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,TDZAA,TDZAS,KMLOC0 +USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT USE CUDA_GEMM_BATCHED_MOD @@ -82,6 +82,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 INTEGER(KIND=JPIM) :: KFIELDS +INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 +INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 +INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 +INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -97,11 +101,20 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !* 1.1 PREPARATIONS. -ALLOCATE(ZINP(KFIELDS*TDZAS*D_NUMP)) -ALLOCATE(ZOUTS(KFIELDS*R_NDGNH*D_NUMP)) -ALLOCATE(ZOUTA(KFIELDS*R_NDGNH*D_NUMP)) -ALLOCATE(ZINP0(KFIELDS/2*TDZAS)) -ALLOCATE(ZOUT0(KFIELDS/2*R_NDGNH)) +IIN_STRIDES0 = KFIELDS +IIN_STRIDES1 = IIN_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) +IOUT_STRIDES0 = KFIELDS +IOUT_STRIDES1 = IOUT_STRIDES0 * R_NDGNH +IIN0_STRIDES0 = KFIELDS/2 +IIN0_STRIDES1 = IIN0_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) +IOUT0_STRIDES0 = KFIELDS/2 +IOUT0_STRIDES1 = IOUT0_STRIDES0 * R_NDGNH + +ALLOCATE(ZINP(IIN_STRIDES1*D_NUMP)) +ALLOCATE(ZOUTS(IOUT_STRIDES1*D_NUMP)) +ALLOCATE(ZOUTA(IOUT_STRIDES1*D_NUMP)) +ALLOCATE(ZINP0(IIN0_STRIDES1*1)) +ALLOCATE(ZOUT0(IOUT0_STRIDES1*1)) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & !$ACC& CREATE (ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & @@ -134,12 +147,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAA)*KFIELDS)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAA)*KFIELDS)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -167,10 +180,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAA*KFIELDS, & - & ZAA, R_NDGNH, TDZAA*R_NDGNH, & + & ZINP, IIN_STRIDES0, IIN_STRIDES1, & + & ZAA, SIZE(ZAA,1), SIZE(ZAA,1)*SIZE(ZAA,2), & & 0.0_JPRBT, & - & ZOUTA, KFIELDS, R_NDGNH*KFIELDS, & + & ZOUTA, IOUT_STRIDES0, IOUT_STRIDES1, & & D_NUMP) CALL cudaDeviceSynchronize() @@ -187,7 +200,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO J=1,(R_NSMAX+2)/2 DO JK=1,KFIELDS,2 IA = 1+MOD(R_NSMAX+2,2) - ZINP0((JK-1)/2+1+(J-1)*KFIELDS/2) = PIA(JK,IA+1+(J-1)*2,KMLOC0) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC0) ENDDO ENDDO @@ -195,10 +208,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & & 1.0_JPRD, & - & ZINP0, KFIELDS/2, TDZAA*KFIELDS/2, & - & ZAA0, R_NDGNH, TDZAA*R_NDGNH, & + & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & + & ZAA0, SIZE(ZAA0,1), SIZE(ZAA0,1)*SIZE(ZAA0,2), & & 0.0_JPRD, & - & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & + & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & & 1) CALL cudaDeviceSynchronize() @@ -206,8 +219,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO JGL=1,G_NDGLU(0) DO JK=1,KFIELDS,2 ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZOUTA((JK-1)/2+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*KFIELDS) & - & = ZOUT0((JK-1)/2+1+(JGL-1)*KFIELDS/2) + ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & + & = ZOUT0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ENDDO ENDDO @@ -231,12 +244,12 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1+(KMLOC-1)*TDZAS)*KFIELDS)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK+1),2) .EQ. 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - ZINP((JK-1)/2+1+(J-1+(KMLOC-1)*TDZAS)*KFIELDS)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP((JK-1)/2+1+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO @@ -261,10 +274,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINP, KFIELDS, TDZAS*KFIELDS, & - & ZAS, R_NDGNH, TDZAS*R_NDGNH, & + & ZINP, IIN_STRIDES0, IIN_STRIDES1, & + & ZAS, SIZE(ZAS,1), SIZE(ZAS,1)*SIZE(ZAS,2), & & 0.0_JPRBT, & - & ZOUTS, KFIELDS, R_NDGNH*KFIELDS, & + & ZOUTS, IOUT_STRIDES0, IOUT_STRIDES1, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -279,7 +292,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO J=1,(R_NSMAX+3)/2 DO JK=1,KFIELDS,2 IS = 1+MOD(R_NSMAX+1,2) - ZINP0((JK-1)/2+1+(J-1)*KFIELDS/2) = PIA(JK,IS+1+(J-1)*2,KMLOC0) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC0) ENDDO ENDDO @@ -287,18 +300,18 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & & 1.0_JPRD, & - & ZINP0, KFIELDS/2, TDZAS*KFIELDS/2, & - & ZAS0, R_NDGNH, TDZAS*R_NDGNH, & + & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & + & ZAS0, SIZE(ZAS0,1), SIZE(ZAS0,1)*SIZE(ZAS0,2), & & 0.0_JPRD, & - & ZOUT0, KFIELDS/2, R_NDGNH*KFIELDS/2, & + & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & & 1) CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JK=1,KFIELDS,2 - ZOUTS((JK-1)/2+1+(JGL-1+(KMLOC0-1)*R_NDGNH)*KFIELDS) & - & = ZOUT0((JK-1)/2+1+(JGL-1)*KFIELDS/2) + ZOUTS((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & + & = ZOUT0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ENDDO ENDDO @@ -321,11 +334,11 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*KFIELDS IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZAOA = ZOUTA(JK+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ELSE ! Imaginary values of KM=0 is zero, though I don't think we care ZSOA = 0_JPRBT diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index affaf7248..a27ad81a2 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -47,9 +47,6 @@ MODULE TPM_FIELDS REAL(KIND=JPRD),ALLOCATABLE :: ZAS0(:,:) INTEGER(KIND=JPIM) :: KMLOC0 -INTEGER(KIND=JPIM) :: TDZAA -INTEGER(KIND=JPIM) :: TDZAS - REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) END MODULE TPM_FIELDS From b3e0625f400a385ee9455ad2575dd86e3db041d0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:28 -0700 Subject: [PATCH 192/263] Add alignment option --- src/trans/gpu/external/setup_trans.F90 | 5 +++-- src/trans/gpu/internal/ledir_mod.F90 | 17 +++++++++-------- src/trans/gpu/internal/leinv_mod.F90 | 17 +++++++++-------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 350bc252b..34eb818d9 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -459,8 +460,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays -ALLOCATE(ZAA(R%NDGNH,(R%NTMAX+2)/2,D%NUMP)) -ALLOCATE(ZAS(R%NDGNH,(R%NTMAX+3)/2,D%NUMP)) +ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) +ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) write(nout,*)'setup_trans: sizes1 NUMP=',D%NUMP write(nout,*)'ZAS:',size(ZAS) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 0efd65991..5b5fd6c96 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -95,14 +96,14 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -IOUT_STRIDES0 = 2*KF_FS -IOUT_STRIDES1 = IOUT_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) -IIN_STRIDES0 = 2*KF_FS -IIN_STRIDES1 = IIN_STRIDES0 * R_NDGNH -IOUT0_STRIDES0 = KF_FS -IOUT0_STRIDES1 = IOUT0_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) -IIN0_STRIDES0 = KF_FS -IIN0_STRIDES1 = IIN0_STRIDES0 * R_NDGNH +IOUT_STRIDES0 = ALIGN(2*KF_FS,8) +IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) +IIN_STRIDES0 = ALIGN(2*KF_FS,8) +IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R_NDGNH,8) +IOUT0_STRIDES0 = ALIGN(KF_FS,8) +IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) +IIN0_STRIDES0 = ALIGN(KF_FS,8) +IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,8) ALLOCATE(ZINPA(IIN_STRIDES1*D_NUMP)) ALLOCATE(ZINPS(IIN_STRIDES1*D_NUMP)) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 15d60293f..365e07efb 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -101,14 +102,14 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !* 1.1 PREPARATIONS. -IIN_STRIDES0 = KFIELDS -IIN_STRIDES1 = IIN_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) -IOUT_STRIDES0 = KFIELDS -IOUT_STRIDES1 = IOUT_STRIDES0 * R_NDGNH -IIN0_STRIDES0 = KFIELDS/2 -IIN0_STRIDES1 = IIN0_STRIDES0 * MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2) -IOUT0_STRIDES0 = KFIELDS/2 -IOUT0_STRIDES1 = IOUT0_STRIDES0 * R_NDGNH +IIN_STRIDES0 = ALIGN(KFIELDS,8) +IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) +IOUT_STRIDES0 = ALIGN(KFIELDS,8) +IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R_NDGNH,8) +IIN0_STRIDES0 = ALIGN(KFIELDS/2,8) +IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) +IOUT0_STRIDES0 = ALIGN(KFIELDS/2,8) +IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R_NDGNH,8) ALLOCATE(ZINP(IIN_STRIDES1*D_NUMP)) ALLOCATE(ZOUTS(IOUT_STRIDES1*D_NUMP)) From 8936ce00db0bf5a30aa9f703799953a99bec31d7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:28 -0700 Subject: [PATCH 193/263] add first cutlass implementation (CHANGE1: 11) (CHANGE2: 13) --- CMakeLists.txt | 3 +- src/trans/gpu/CMakeLists.txt | 3 + .../gpu/algor/external/gemm/gemm_wrapper.cu | 201 +++++++++++++++--- .../gpu/internal/cuda_gemm_batched_mod.F90 | 4 +- src/trans/gpu/internal/ledir_mod.F90 | 18 +- src/trans/gpu/internal/leinv_mod.F90 | 18 +- 6 files changed, 203 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5c833fb2..1e57b1b26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ # (C) Copyright 2020- ECMWF. +# (C) Copyright 2022- NVIDIA. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,7 +10,7 @@ cmake_minimum_required( VERSION 3.12 FATAL_ERROR ) find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild ) -project( ectrans LANGUAGES C Fortran ) +project( ectrans LANGUAGES C CXX Fortran ) include( ectrans_macros ) ecbuild_enable_fortran( REQUIRED NO_MODULE_DIRECTORY ) diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index f04e02045..103d6660f 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -133,6 +133,8 @@ foreach( prec sp dp ) endif() endforeach() +ecbuild_find_package( NAME NvidiaCutlass REQUIRED) + ## precision-independent GPU library with CUDA kernels ecbuild_add_library( TARGET gpu TYPE STATIC @@ -140,6 +142,7 @@ ecbuild_add_library( TARGET gpu algor/external/fourier/fft_wrapper.cu algor/external/gemm/gemm_wrapper.cu PRIVATE_INCLUDES ${MPI_C_INCLUDE_PATH} + PRIVATE_LIBS nvidia::cutlass::cutlass ) ## CUDA architecture diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 58bc3bed8..28c524ab2 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -1,6 +1,9 @@ #include "cublas_v2.h" +#include "cutlass/gemm/device/gemm.h" #include +constexpr bool use_cutlass = true; + #define CUDA_CHECK(e) \ { \ cudaError_t err = (e); \ @@ -19,36 +22,134 @@ exit(EXIT_FAILURE); \ } \ } +#define CUTLASS_CHECK(e) \ + { \ + cutlass::Status err = (e); \ + if (err != cutlass::Status::kSuccess) { \ + fprintf(stderr, "CUTLASS error: %s, line %d, %s: %i\n", __FILE__, \ + __LINE__, #e, (int)err); \ + exit(EXIT_FAILURE); \ + } \ + } -extern "C" { -void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, double alpha, const double *A, - int lda, int tda, const double *B, int ldb, int tdb, - double beta, double *C, int ldc, int tdc, - int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); +template +void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, + const float *A, int lda, int tda, + const float *B, int ldb, int tdb, + float beta, float *C, int ldc, int tdc, + int batchCount) { +#if 0 + // we will enable this later (this ifdefs did not work, so I am going to enable this properly ltaer) + // this was verified using Ampere and uses 3XTF32 + constexpr int AlignmentA = 4; + constexpr int AlignmentB = 4; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using MyOp = cutlass::arch::OpMultiplyAddFastF32; - CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, - &alpha, A, lda, tda, B, ldb, tdb, - &beta, C, ldc, tdc, batchCount)); -} + using Gemm = cutlass::gemm::device::Gemm< + float, + std::conditional_t, // + float, + std::conditional_t, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm80, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // + 128 / cutlass::sizeof_bits::value, + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 3, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // + >; + constexpr int sz_align = 8; +#else + // this was verified using Volta and uses FP32 + constexpr int AlignmentA = 1; + constexpr int AlignmentB = 1; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OperatorClass = cutlass::arch::OpClassSimt; + using MyOp = cutlass::arch::OpMultiplyAdd; -void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, float alpha, const float *A, - int lda, int tda, const float *B, int ldb, int tdb, - float beta, float *C, int ldc, int tdc, - int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); + using Gemm = cutlass::gemm::device::Gemm< + float, // + std::conditional_t, // + float, // + std::conditional_t, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm50, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // + 1, // + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 2, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // + >; + constexpr int sz_align = 1; +#endif - CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, - &alpha, A, lda, tda, B, ldb, tdb, - &beta, C, ldc, tdc, batchCount)); + Gemm gemm_op; + for (int i = 0; i < batchCount; ++i) { + CUTLASS_CHECK(gemm_op({// + {(m + sz_align - 1) / sz_align * sz_align, + (n[i] + sz_align - 1) / sz_align * sz_align, + (k[i] + sz_align - 1) / sz_align * sz_align}, + {const_cast(A + i * tda), lda}, + {const_cast(B + i * tdb), ldb}, + {C + i * tdc, ldc}, + {C + i * tdc, ldc}, + {alpha, beta}})); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} +void cutlass_sgemm_wrapper_grouped(cublasOperation_t transa, + cublasOperation_t transb, int m, int *n, + int *k, float alpha, const float *A, int lda, + int tda, const float *B, int ldb, int tdb, + float beta, float *C, int ldc, int tdc, + int batchCount) { + if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_N) + cutlass_sgemm_wrapper_grouped_v( + m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, + batchCount); + else if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_T) + cutlass_sgemm_wrapper_grouped_v( + m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, + batchCount); + else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_N) + cutlass_sgemm_wrapper_grouped_v( + m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, + batchCount); + else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_T) + cutlass_sgemm_wrapper_grouped_v( + m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, + batchCount); + else + assert(false); } - void cublas_sgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, @@ -81,4 +182,54 @@ void cublas_dgemm_wrapper_grouped(cublasOperation_t transa, C + i * tdc, ldc)); } } + +extern "C" { +void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, double alpha, const double *A, + int lda, int tda, const double *B, int ldb, int tdb, + double beta, double *C, int ldc, int tdc, + int batchCount) { + static cublasHandle_t handle = nullptr; + if (!handle) + CUBLAS_CHECK(cublasCreate(&handle)); + + CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, + &alpha, A, lda, tda, B, ldb, tdb, + &beta, C, ldc, tdc, batchCount)); +} + +void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, float alpha, const float *A, + int lda, int tda, const float *B, int ldb, int tdb, + float beta, float *C, int ldc, int tdc, + int batchCount) { + static cublasHandle_t handle = nullptr; + if (!handle) + CUBLAS_CHECK(cublasCreate(&handle)); + + CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, + &alpha, A, lda, tda, B, ldb, tdb, + &beta, C, ldc, tdc, batchCount)); +} + +void blas_sgemm_wrapper_grouped(cublasOperation_t transa, + cublasOperation_t transb, int m, int *n, int *k, + float alpha, const float *A, int lda, int tda, + const float *B, int ldb, int tdb, float beta, + float *C, int ldc, int tdc, int batchCount) { + if (use_cutlass) + cutlass_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, + B, ldb, tdb, beta, C, ldc, tdc, batchCount); + else + cublas_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, B, + ldb, tdb, beta, C, ldc, tdc, batchCount); +} +void blas_dgemm_wrapper_grouped(cublasOperation_t transa, + cublasOperation_t transb, int m, int *n, int *k, + double alpha, const double *A, int lda, int tda, + const double *B, int ldb, int tdb, double beta, + double *C, int ldc, int tdc, int batchCount) { + cublas_dgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, B, + ldb, tdb, beta, C, ldc, tdc, batchCount); +} } diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 24eecda9b..33c9e8619 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -41,7 +41,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED(& & BETA, & & C, LDC, TDC, & & BATCHCOUNT & - &) BIND(C, NAME='cublas_dgemm_wrapper_grouped') + &) BIND(C, NAME='blas_dgemm_wrapper_grouped') USE ISO_C_BINDING INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT REAL(C_DOUBLE), VALUE :: ALPHA,BETA @@ -56,7 +56,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED(& & BETA, & & C, LDC, TDC, & & BATCHCOUNT & - &) BIND(C, NAME='cublas_sgemm_wrapper_grouped') + &) BIND(C, NAME='blas_sgemm_wrapper_grouped') USE ISO_C_BINDING INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA,BETA diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 5b5fd6c96..e0d694345 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -94,16 +94,18 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 +INTEGER(KIND=JPIM) :: A = 8 !Alignment + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -IOUT_STRIDES0 = ALIGN(2*KF_FS,8) -IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) -IIN_STRIDES0 = ALIGN(2*KF_FS,8) -IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R_NDGNH,8) -IOUT0_STRIDES0 = ALIGN(KF_FS,8) -IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) -IIN0_STRIDES0 = ALIGN(KF_FS,8) -IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,8) +IOUT_STRIDES0 = ALIGN(2*KF_FS,A) +IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) +IIN_STRIDES0 = ALIGN(2*KF_FS,A) +IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R_NDGNH,A) +IOUT0_STRIDES0 = ALIGN(KF_FS,A) +IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) +IIN0_STRIDES0 = ALIGN(KF_FS,A) +IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,A) ALLOCATE(ZINPA(IIN_STRIDES1*D_NUMP)) ALLOCATE(ZINPS(IIN_STRIDES1*D_NUMP)) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 365e07efb..dde5f1a69 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -88,6 +88,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 +INTEGER(KIND=JPIM) :: A = 8 !Alignment + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -102,14 +104,14 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !* 1.1 PREPARATIONS. -IIN_STRIDES0 = ALIGN(KFIELDS,8) -IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) -IOUT_STRIDES0 = ALIGN(KFIELDS,8) -IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R_NDGNH,8) -IIN0_STRIDES0 = ALIGN(KFIELDS/2,8) -IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),8) -IOUT0_STRIDES0 = ALIGN(KFIELDS/2,8) -IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R_NDGNH,8) +IIN_STRIDES0 = ALIGN(KFIELDS,A) +IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) +IOUT_STRIDES0 = ALIGN(KFIELDS,A) +IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R_NDGNH,A) +IIN0_STRIDES0 = ALIGN(KFIELDS/2,A) +IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) +IOUT0_STRIDES0 = ALIGN(KFIELDS/2,A) +IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R_NDGNH,A) ALLOCATE(ZINP(IIN_STRIDES1*D_NUMP)) ALLOCATE(ZOUTS(IOUT_STRIDES1*D_NUMP)) From 10d0883dd0eddc8dc42704c4ab8caeb77e9d8c18 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:28 -0700 Subject: [PATCH 194/263] change grouped gemm from stride to offset based --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 88 ++++++++++--------- .../gpu/internal/cuda_gemm_batched_mod.F90 | 52 +++++------ src/trans/gpu/internal/ledir_mod.F90 | 32 ++++--- src/trans/gpu/internal/leinv_mod.F90 | 32 ++++--- 4 files changed, 111 insertions(+), 93 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 28c524ab2..b8ac2ac64 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -34,10 +34,10 @@ constexpr bool use_cutlass = true; template void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, - const float *A, int lda, int tda, - const float *B, int ldb, int tdb, - float beta, float *C, int ldc, int tdc, - int batchCount) { + const float *A, int lda, int *offsetsA, + const float *B, int ldb, int *offsetsB, + float beta, float *C, int ldc, + int *offsetsC, int batchCount) { #if 0 // we will enable this later (this ifdefs did not work, so I am going to enable this properly ltaer) // this was verified using Ampere and uses 3XTF32 @@ -117,10 +117,10 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, {(m + sz_align - 1) / sz_align * sz_align, (n[i] + sz_align - 1) / sz_align * sz_align, (k[i] + sz_align - 1) / sz_align * sz_align}, - {const_cast(A + i * tda), lda}, - {const_cast(B + i * tdb), ldb}, - {C + i * tdc, ldc}, - {C + i * tdc, ldc}, + {const_cast(A + offsetsA[i]), lda}, + {const_cast(B + offsetsB[i]), ldb}, + {C + offsetsC[i], ldc}, + {C + offsetsC[i], ldc}, {alpha, beta}})); } CUDA_CHECK(cudaDeviceSynchronize()); @@ -128,58 +128,59 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, void cutlass_sgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, - int tda, const float *B, int ldb, int tdb, - float beta, float *C, int ldc, int tdc, - int batchCount) { + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, int ldc, + int *offsetsC, int batchCount) { if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_N) cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, - batchCount); + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); else if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_T) cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, - batchCount); + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_N) cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, - batchCount); + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_T) cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, tda, B, ldb, tdb, beta, C, ldc, tdc, - batchCount); + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); else assert(false); } void cublas_sgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, - int tda, const float *B, int ldb, int tdb, - float beta, float *C, int ldc, int tdc, - int batchCount) { + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, int ldc, + int *offsetsC, int batchCount) { static cublasHandle_t handle = nullptr; if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); for (int i = 0; i < batchCount; ++i) { CUBLAS_CHECK(cublasSgemm(handle, transa, transb, m, n[i], k[i], &alpha, - A + i * tda, lda, B + i * tdb, ldb, &beta, - C + i * tdc, ldc)); + A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, + C + offsetsC[i], ldc)); } } void cublas_dgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, double alpha, const double *A, - int lda, int tda, const double *B, int ldb, - int tdb, double beta, double *C, int ldc, - int tdc, int batchCount) { + int lda, int *offsetsA, const double *B, + int ldb, int *offsetsB, double beta, + double *C, int ldc, int *offsetsC, + int batchCount) { static cublasHandle_t handle = nullptr; if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); for (int i = 0; i < batchCount; ++i) { CUBLAS_CHECK(cublasDgemm(handle, transa, transb, m, n[i], k[i], &alpha, - A + i * tda, lda, B + i * tdb, ldb, &beta, - C + i * tdc, ldc)); + A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, + C + offsetsC[i], ldc)); } } @@ -214,22 +215,27 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, void blas_sgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, - float alpha, const float *A, int lda, int tda, - const float *B, int ldb, int tdb, float beta, - float *C, int ldc, int tdc, int batchCount) { + float alpha, const float *A, int lda, + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, int ldc, + int *offsetsC, int batchCount) { if (use_cutlass) - cutlass_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, - B, ldb, tdb, beta, C, ldc, tdc, batchCount); + cutlass_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, + offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); else - cublas_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, B, - ldb, tdb, beta, C, ldc, tdc, batchCount); + cublas_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, + offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount); } void blas_dgemm_wrapper_grouped(cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, - double alpha, const double *A, int lda, int tda, - const double *B, int ldb, int tdb, double beta, - double *C, int ldc, int tdc, int batchCount) { - cublas_dgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, tda, B, - ldb, tdb, beta, C, ldc, tdc, batchCount); + double alpha, const double *A, int lda, + int *offsetsA, const double *B, int ldb, + int *offsetsB, double beta, double *C, int ldc, + int *offsetsC, int batchCount) { + cublas_dgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, offsetsA, + B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount); } } diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 33c9e8619..762d1def6 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -36,14 +36,14 @@ SUBROUTINE CUDA_DGEMM_GROUPED(& & CTA, CTB, & & M, N, K, & & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & + & A, LDA, OFFSETA, & + & B, LDB, OFFSETB, & & BETA, & - & C, LDC, TDC, & + & C, LDC, OFFSETC, & & BATCHCOUNT & &) BIND(C, NAME='blas_dgemm_wrapper_grouped') USE ISO_C_BINDING - INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) END SUBROUTINE CUDA_DGEMM_GROUPED @@ -51,14 +51,14 @@ SUBROUTINE CUDA_SGEMM_GROUPED(& & CTA, CTB, & & M, N, K, & & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & + & A, LDA, OFFSETA, & + & B, LDB, OFFSETB, & & BETA, & - & C, LDC, TDC, & + & C, LDC, OFFSETC, & & BATCHCOUNT & &) BIND(C, NAME='blas_sgemm_wrapper_grouped') USE ISO_C_BINDING - INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) END SUBROUTINE CUDA_SGEMM_GROUPED @@ -165,10 +165,10 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & & BETA, & - & CARRAY, LDC, STRIDEC, & + & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB @@ -178,14 +178,14 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & REAL(KIND=JPRD) :: ALPHA REAL(KIND=JPRD), DIMENSION(*) :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: STRIDEA + INTEGER(KIND=JPIM) :: OFFSETA(:) REAL(KIND=JPRD), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: STRIDEB + INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA REAL(KIND=JPRD), DIMENSION(*) :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) @@ -193,10 +193,10 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & & BETA, & - & CARRAY, LDC, STRIDEC, & + & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) !$ACC END HOST_DATA END SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD @@ -205,10 +205,10 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & & BETA, & - & CARRAY, LDC, STRIDEC, & + & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB @@ -218,14 +218,14 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & REAL(KIND=JPRM) :: ALPHA REAL(KIND=JPRM), DIMENSION(*) :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: STRIDEA + INTEGER(KIND=JPIM) :: OFFSETA(:) REAL(KIND=JPRM), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: STRIDEB + INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(*) :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) @@ -233,10 +233,10 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & + & AARRAY, LDA, OFFSETA, & + & BARRAY, LDB, OFFSETB, & & BETA, & - & CARRAY, LDC, STRIDEC, & + & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) !$ACC END HOST_DATA END SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e0d694345..125181832 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -80,7 +80,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK -INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) +INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS @@ -171,16 +171,19 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+2)/2 KS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO ! TODO SKIP KMLOC0 CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINPA, IIN_STRIDES0, IIN_STRIDES1, & - & ZAA, SIZE(ZAA,1), SIZE(ZAA,1)*SIZE(ZAA,2), & + & ZINPA, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & - & ZOUT, IOUT_STRIDES0, IOUT_STRIDES1, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -225,10 +228,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & - & ZAA0, SIZE(ZAA0,1), SIZE(ZAA0,1)*SIZE(ZAA0,2), & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, IOUT_STRIDES1, & + & ZOUT0, IOUT0_STRIDES0, 0, & & 1) CALL cudaDeviceSynchronize() @@ -256,16 +259,19 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+3)/2 KS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO ! TODO SKIP KMLOC0 CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINPS, IIN_STRIDES0, IIN_STRIDES1, & - & ZAS, SIZE(ZAS,1), SIZE(ZAS,1)*SIZE(ZAS,2), & + & ZINPS, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & - & ZOUT, IOUT_STRIDES0, IOUT_STRIDES1, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -306,10 +312,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, IIN_STRIDES1, & - & ZAS0, SIZE(ZAS0,1), SIZE(ZAS0,1)*SIZE(ZAS0,2), & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & + & ZOUT0, IOUT0_STRIDES0, 0, & & 1) call cudaDeviceSynchronize() diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index dde5f1a69..a6f35d3c3 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -72,7 +72,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: KIFC -INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) +INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) REAL(KIND=JPRB), INTENT(OUT), ALLOCATABLE :: FOUBUF_IN(:) @@ -177,16 +177,19 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+2)/2 NS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO ! TODO SKIP KMLOC0 CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINP, IIN_STRIDES0, IIN_STRIDES1, & - & ZAA, SIZE(ZAA,1), SIZE(ZAA,1)*SIZE(ZAA,2), & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & - & ZOUTA, IOUT_STRIDES0, IOUT_STRIDES1, & + & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP) CALL cudaDeviceSynchronize() @@ -211,10 +214,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & - & ZAA0, SIZE(ZAA0,1), SIZE(ZAA0,1)*SIZE(ZAA0,2), & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & + & ZOUT0, IOUT0_STRIDES0, 0, & & 1) CALL cudaDeviceSynchronize() @@ -271,16 +274,19 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+3)/2 NS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO ! TODO SKIP KMLOC0 CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & - & ZINP, IIN_STRIDES0, IIN_STRIDES1, & - & ZAS, SIZE(ZAS,1), SIZE(ZAS,1)*SIZE(ZAS,2), & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & - & ZOUTS, IOUT_STRIDES0, IOUT_STRIDES1, & + & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -303,10 +309,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, IIN0_STRIDES1, & - & ZAS0, SIZE(ZAS0,1), SIZE(ZAS0,1)*SIZE(ZAS0,2), & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, IOUT0_STRIDES1, & + & ZOUT0, IOUT0_STRIDES0, 0, & & 1) CALL cudaDeviceSynchronize() From 5ffc23e04e7823b89bcc0c57c17ac653c5d3b8ff Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:28 -0700 Subject: [PATCH 195/263] Skip KMLOC0 if on my proc --- src/trans/gpu/algor/external/gemm/gemm_wrapper.cu | 6 ++++++ src/trans/gpu/internal/ledir_mod.F90 | 10 ++++++++-- src/trans/gpu/internal/leinv_mod.F90 | 10 ++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index b8ac2ac64..60a5823e5 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -113,6 +113,8 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, Gemm gemm_op; for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; CUTLASS_CHECK(gemm_op({// {(m + sz_align - 1) / sz_align * sz_align, (n[i] + sz_align - 1) / sz_align * sz_align, @@ -161,6 +163,8 @@ void cublas_sgemm_wrapper_grouped(cublasOperation_t transa, CUBLAS_CHECK(cublasCreate(&handle)); for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; CUBLAS_CHECK(cublasSgemm(handle, transa, transb, m, n[i], k[i], &alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, C + offsetsC[i], ldc)); @@ -178,6 +182,8 @@ void cublas_dgemm_wrapper_grouped(cublasOperation_t transa, CUBLAS_CHECK(cublasCreate(&handle)); for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; CUBLAS_CHECK(cublasDgemm(handle, transa, transb, m, n[i], k[i], &alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, C + offsetsC[i], ldc)); diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 125181832..db3c5c9a7 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -175,7 +175,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO -! TODO SKIP KMLOC0 +IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 +ENDIF CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & @@ -263,7 +266,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO -! TODO SKIP KMLOC0 +IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 +ENDIF CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index a6f35d3c3..e2694f3bb 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -181,7 +181,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO -! TODO SKIP KMLOC0 +IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 +ENDIF CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & @@ -278,7 +281,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) ENDDO -! TODO SKIP KMLOC0 +IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 +ENDIF CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & From 63cf51827a7dac6f870507c8f4af914336abdf53 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:28 -0700 Subject: [PATCH 196/263] Update arch to Sm70 for pre-Ampere archs --- src/trans/gpu/algor/external/gemm/gemm_wrapper.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 60a5823e5..586103dc7 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -93,7 +93,7 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, cutlass::layout::RowMajor>, // float, cutlass::layout::ColumnMajor, // float, // - OperatorClass, cutlass::arch::Sm50, // + OperatorClass, cutlass::arch::Sm70, // ThreadblockShape, WarpShape, InstructionShape, // cutlass::epilogue::thread::LinearCombination< // float, // From 40ab86a22fee8b7b9b6299fc9a9c19c7b816cd9e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:29 -0700 Subject: [PATCH 197/263] add cuda graphs for GEMMS (slow at this point because no caching) --- src/trans/gpu/CMakeLists.txt | 1 + .../gpu/algor/external/gemm/gemm_wrapper.cu | 381 ++++++++++++------ .../gpu/internal/cuda_gemm_batched_mod.F90 | 18 +- src/trans/gpu/internal/ledir_mod.F90 | 2 + src/trans/gpu/internal/leinv_mod.F90 | 2 + 5 files changed, 270 insertions(+), 134 deletions(-) diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 103d6660f..749bec737 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -144,6 +144,7 @@ ecbuild_add_library( TARGET gpu PRIVATE_INCLUDES ${MPI_C_INCLUDE_PATH} PRIVATE_LIBS nvidia::cutlass::cutlass ) +target_compile_features(gpu PRIVATE cxx_std_17) ## CUDA architecture set_property( TARGET gpu PROPERTY CUDA_ARCHITECTURES 70 ) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 586103dc7..7678f69a4 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -1,6 +1,12 @@ +#include + +#include +#include +#include +#include + #include "cublas_v2.h" #include "cutlass/gemm/device/gemm.h" -#include constexpr bool use_cutlass = true; @@ -13,36 +19,125 @@ constexpr bool use_cutlass = true; exit(EXIT_FAILURE); \ } \ } -#define CUBLAS_CHECK(e) \ - { \ - cublasStatus_t err = (e); \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "CUBLAS error: %s, line %d, %s: %i\n", __FILE__, \ - __LINE__, #e, err); \ - exit(EXIT_FAILURE); \ - } \ +#define CUBLAS_CHECK(e) \ + { \ + cublasStatus_t err = (e); \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "CUBLAS error: %s, line %d, %s: %i\n", __FILE__, \ + __LINE__, #e, err); \ + exit(EXIT_FAILURE); \ + } \ } -#define CUTLASS_CHECK(e) \ - { \ - cutlass::Status err = (e); \ - if (err != cutlass::Status::kSuccess) { \ - fprintf(stderr, "CUTLASS error: %s, line %d, %s: %i\n", __FILE__, \ - __LINE__, #e, (int)err); \ - exit(EXIT_FAILURE); \ - } \ +#define CUTLASS_CHECK(e) \ + { \ + cutlass::Status err = (e); \ + if (err != cutlass::Status::kSuccess) { \ + fprintf(stderr, "CUTLASS error: %s, line %d, %s: %i\n", __FILE__, \ + __LINE__, #e, (int)err); \ + exit(EXIT_FAILURE); \ + } \ + } + +namespace { +namespace detail { +struct pair_hash { + std::size_t operator()(const std::pair &p) const { + return p.first * 10000 + p.second; + } +}; +} // namespace detail + +// this version is using cuda graphs and caches the graphs +template +void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, + const Real *A, int lda, int *offsetsA, const Real *B, + int ldb, int *offsetsB, Real beta, Real *C, int ldc, + int *offsetsC, int batchCount, int blas_id = -1) { + // we store at most one graph per "m" (# fields) and "blas id" + static std::unordered_map, cudaGraphExec_t, + detail::pair_hash> + graphCache; + + // we also store A, B, and C and recreate the graph if they change + static std::unordered_map< + std::pair, std::tuple, + detail::pair_hash> + ptrCache; + + auto key = std::make_pair(m, blas_id); + + auto ptrs = ptrCache.find(key); + if (ptrs != ptrCache.end() && + (std::get<0>(ptrs->second) != A || std::get<1>(ptrs->second) != B || + std::get<2>(ptrs->second) != C)) { + // the plan is cached, but the pointers are not correct. we remove and + // delete the graph, but we keep the cublas handles, if this happens more + // often, we should cache this... + std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + CUDA_CHECK(cudaGraphExecDestroy(graphCache[key])); + graphCache.erase(key); + ptrCache.erase(key); + } + + auto graph = graphCache.find(key); + if (graph == graphCache.end()) { + // this graph does not exist yet + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + cudaGraph_t new_graph; + cudaGraphCreate(&new_graph, 0); + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) continue; + + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb, beta, C + offsetsC[i], ldc); + cudaGraph_t my_graph; + CUDA_CHECK(cudaStreamEndCapture(stream, &my_graph)); + cudaGraphNode_t my_node; + CUDA_CHECK(cudaGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, + my_graph)); + } + cudaGraphExec_t instance; + CUDA_CHECK(cudaGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaGraphDestroy(new_graph)); + + graphCache.insert({key, instance}); + ptrCache.insert({key, std::make_tuple(A, B, C)}); + } + + CUDA_CHECK(cudaGraphLaunch(graphCache.at(key), 0)); +} + +// stupid simple gemm calls +template +void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, + int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, + Real beta, Real *C, int ldc, int *offsetsC, int batchCount, + int = -1) { + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) continue; + gemm(0, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, + beta, C + offsetsC[i], ldc); } +} + +template +CutlassGemm &get_cutlass_handle() { + static auto handle = std::make_unique(); + return *handle; +} +namespace detail { template -void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, - float beta, float *C, int ldc, - int *offsetsC, int batchCount) { +class cutlass_sgemm_grouped { #if 0 // we will enable this later (this ifdefs did not work, so I am going to enable this properly ltaer) // this was verified using Ampere and uses 3XTF32 - constexpr int AlignmentA = 4; - constexpr int AlignmentB = 4; + static constexpr int AlignmentA = 4; + static constexpr int AlignmentB = 4; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; @@ -52,32 +147,32 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, using Gemm = cutlass::gemm::device::Gemm< float, std::conditional_t, // + cutlass::layout::RowMajor>, // float, std::conditional_t, // - float, cutlass::layout::ColumnMajor, // - float, // - OperatorClass, cutlass::arch::Sm80, // - ThreadblockShape, WarpShape, InstructionShape, // - cutlass::epilogue::thread::LinearCombination< // - float, // + cutlass::layout::RowMajor>, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm80, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // 128 / cutlass::sizeof_bits::value, - float, // - float // - >, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // - 3, // - AlignmentA, // - AlignmentB, // - true, // - MyOp // + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 3, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // >; - constexpr int sz_align = 8; + static constexpr int sz_align = 8; #else // this was verified using Volta and uses FP32 - constexpr int AlignmentA = 1; - constexpr int AlignmentB = 1; + static constexpr int AlignmentA = 1; + static constexpr int AlignmentB = 1; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; @@ -85,110 +180,146 @@ void cutlass_sgemm_wrapper_grouped_v(int m, int *n, int *k, float alpha, using MyOp = cutlass::arch::OpMultiplyAdd; using Gemm = cutlass::gemm::device::Gemm< - float, // + float, // std::conditional_t, // - float, // + cutlass::layout::RowMajor>, // + float, // std::conditional_t, // - float, cutlass::layout::ColumnMajor, // - float, // - OperatorClass, cutlass::arch::Sm70, // - ThreadblockShape, WarpShape, InstructionShape, // - cutlass::epilogue::thread::LinearCombination< // - float, // - 1, // - float, // - float // - >, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // - 2, // - AlignmentA, // - AlignmentB, // - true, // - MyOp // + cutlass::layout::RowMajor>, // + float, cutlass::layout::ColumnMajor, // + float, // + OperatorClass, cutlass::arch::Sm70, // + ThreadblockShape, WarpShape, InstructionShape, // + cutlass::epilogue::thread::LinearCombination< // + float, // + 1, // + float, // + float // + >, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, // + 2, // + AlignmentA, // + AlignmentB, // + true, // + MyOp // >; - constexpr int sz_align = 1; + static constexpr int sz_align = 1; #endif - Gemm gemm_op; - for (int i = 0; i < batchCount; ++i) { - if (m == 0 || n[i] == 0 || k[i] == 0) - continue; - CUTLASS_CHECK(gemm_op({// - {(m + sz_align - 1) / sz_align * sz_align, - (n[i] + sz_align - 1) / sz_align * sz_align, - (k[i] + sz_align - 1) / sz_align * sz_align}, - {const_cast(A + offsetsA[i]), lda}, - {const_cast(B + offsetsB[i]), ldb}, - {C + offsetsC[i], ldc}, - {C + offsetsC[i], ldc}, - {alpha, beta}})); + public: + void operator()(cudaStream_t stream, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) const { + auto &gemm_op = get_cutlass_handle(); + CUTLASS_CHECK(gemm_op( + {// + {(m + sz_align - 1) / sz_align * sz_align, + (n + sz_align - 1) / sz_align * sz_align, + (k + sz_align - 1) / sz_align * sz_align}, + {const_cast(A), lda}, + {const_cast(B), ldb}, + {C, ldc}, + {C, ldc}, + {alpha, beta}}, + nullptr, stream)); } - CUDA_CHECK(cudaDeviceSynchronize()); +}; + +} // namespace detail +template +void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, + float alpha, const float *A, int lda, + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, + int ldc, int *offsetsC, int batchCount) { + using namespace detail; + run_group_graph(cutlass_sgemm_grouped(), m, n, k, alpha, A, + lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, blas_id); } -void cutlass_sgemm_wrapper_grouped(cublasOperation_t transa, +void cutlass_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount) { if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_N) - cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount); else if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_T) - cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_N) - cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_T) - cutlass_sgemm_wrapper_grouped_v( - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + cutlass_sgemm_wrapper_grouped_op( + blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount); else assert(false); } -void cublas_sgemm_wrapper_grouped(cublasOperation_t transa, + +namespace detail { +cublasHandle_t get_cublas_handle() { + static cublasHandle_t handle; + if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); + return handle; +} +template +struct cublas_gemm_grouped { + public: + cublas_gemm_grouped(cublasOperation_t transa, cublasOperation_t transb) + : transa_(transa), transb_(transb) { + // we need to get the cublas handle here, otherwise this could be created + // during graph capturing + get_cublas_handle(); + }; + void operator()(cudaStream_t stream, int m, int n, int k, Real alpha, + const Real *A, int lda, const Real *B, int ldb, Real beta, + Real *C, int ldc) const { + cublasHandle_t handle = get_cublas_handle(); + CUBLAS_CHECK(cublasSetStream(handle, stream)); + + if constexpr (std::is_same::value) + CUBLAS_CHECK(cublasSgemm(handle, transa_, transb_, m, n, k, &alpha, A, + lda, B, ldb, &beta, C, ldc)); + if constexpr (std::is_same::value) + CUBLAS_CHECK(cublasDgemm(handle, transa_, transb_, m, n, k, &alpha, A, + lda, B, ldb, &beta, C, ldc)); + } + + private: + cublasOperation_t transa_, transb_; +}; +} // namespace detail +void cublas_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); - - for (int i = 0; i < batchCount; ++i) { - if (m == 0 || n[i] == 0 || k[i] == 0) - continue; - CUBLAS_CHECK(cublasSgemm(handle, transa, transb, m, n[i], k[i], &alpha, - A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, - C + offsetsC[i], ldc)); - } + using namespace detail; + run_group_graph(cublas_gemm_grouped(transa, transb), m, n, k, alpha, A, + lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, blas_id); } -void cublas_dgemm_wrapper_grouped(cublasOperation_t transa, +void cublas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, double alpha, const double *A, int lda, int *offsetsA, const double *B, int ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); - - for (int i = 0; i < batchCount; ++i) { - if (m == 0 || n[i] == 0 || k[i] == 0) - continue; - CUBLAS_CHECK(cublasDgemm(handle, transa, transb, m, n[i], k[i], &alpha, - A + offsetsA[i], lda, B + offsetsB[i], ldb, &beta, - C + offsetsC[i], ldc)); - } + using namespace detail; + run_group_graph(cublas_gemm_grouped(transa, transb), m, n, k, alpha, + A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, blas_id); } +} // namespace extern "C" { void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, @@ -197,8 +328,7 @@ void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, double beta, double *C, int ldc, int tdc, int batchCount) { static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); + if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, @@ -211,36 +341,35 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, float beta, float *C, int ldc, int tdc, int batchCount) { static cublasHandle_t handle = nullptr; - if (!handle) - CUBLAS_CHECK(cublasCreate(&handle)); + if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); } -void blas_sgemm_wrapper_grouped(cublasOperation_t transa, +void blas_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount) { if (use_cutlass) - cutlass_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, - offsetsA, B, ldb, offsetsB, beta, C, ldc, + cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, + lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount); else - cublas_sgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, + cublas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount); } -void blas_dgemm_wrapper_grouped(cublasOperation_t transa, +void blas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, double alpha, const double *A, int lda, int *offsetsA, const double *B, int ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, int batchCount) { - cublas_dgemm_wrapper_grouped(transa, transb, m, n, k, alpha, A, lda, offsetsA, + cublas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount); } diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 762d1def6..301b5d5a8 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -33,7 +33,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED(& REAL(C_FLOAT) :: A(*), B(*), C(*) END SUBROUTINE CUDA_SGEMM_BATCHED SUBROUTINE CUDA_DGEMM_GROUPED(& - & CTA, CTB, & + & BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -43,12 +43,12 @@ SUBROUTINE CUDA_DGEMM_GROUPED(& & BATCHCOUNT & &) BIND(C, NAME='blas_dgemm_wrapper_grouped') USE ISO_C_BINDING - INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT + INTEGER(C_INT), VALUE :: BLAS_ID, CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) END SUBROUTINE CUDA_DGEMM_GROUPED SUBROUTINE CUDA_SGEMM_GROUPED(& - & CTA, CTB, & + & BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -58,7 +58,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED(& & BATCHCOUNT & &) BIND(C, NAME='blas_sgemm_wrapper_grouped') USE ISO_C_BINDING - INTEGER(C_INT), VALUE :: CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT + INTEGER(C_INT), VALUE :: BLAS_ID, CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) END SUBROUTINE CUDA_SGEMM_GROUPED @@ -162,7 +162,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & END SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & - & TRANSA, TRANSB, & + & BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -170,6 +170,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & & BETA, & & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) + INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB INTEGER(KIND=JPIM) :: M @@ -190,7 +191,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_DGEMM_GROUPED( & - & TRANSA, TRANSB, & + & BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -202,7 +203,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & END SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & - & TRANSA, TRANSB, & + & BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -210,6 +211,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & & BETA, & & CARRAY, LDC, OFFSETC, & & BATCHCOUNT) + INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB INTEGER(KIND=JPIM) :: M @@ -230,7 +232,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_SGEMM_GROUPED( & - & TRANSA, TRANSB, & + & BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index db3c5c9a7..8ecb38cb6 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -180,6 +180,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) KS(KMLOC0) = 0 ENDIF CALL CUDA_GEMM_BATCHED( & + & 21, & ! unique identifier & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & @@ -271,6 +272,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) KS(KMLOC0) = 0 ENDIF CALL CUDA_GEMM_BATCHED( & + & 22, & ! unique identifier & CUBLAS_OP_N, CUBLAS_OP_N, & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index e2694f3bb..2b774a6bd 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -186,6 +186,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) KS(KMLOC0) = 0 ENDIF CALL CUDA_GEMM_BATCHED( & + & 11, & ! unique identifier & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & @@ -286,6 +287,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) KS(KMLOC0) = 0 ENDIF CALL CUDA_GEMM_BATCHED( & + & 12, & ! unique identifier & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS, NS(:), KS(:), & & 1.0_JPRBT, & From 353e6e80710d1811807e10f29f4657ef3b4514a7 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:29 -0700 Subject: [PATCH 198/263] re-use buffers for leinv --- .../gpu/internal/cuda_gemm_batched_mod.F90 | 16 +++--- src/trans/gpu/internal/leinv_mod.F90 | 54 ++++++++++++++----- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 301b5d5a8..1782d7fda 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -96,14 +96,14 @@ SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: N INTEGER(KIND=JPIM) :: K REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(*) :: AARRAY + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA REAL(KIND=JPRD), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(*) :: CARRAY + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT @@ -136,14 +136,14 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: N INTEGER(KIND=JPIM) :: K REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(*) :: AARRAY + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA REAL(KIND=JPRM), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(*) :: CARRAY + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT @@ -177,14 +177,14 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: N(:) INTEGER(KIND=JPIM) :: K(:) REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(*) :: AARRAY + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: OFFSETA(:) REAL(KIND=JPRD), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(*) :: CARRAY + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT @@ -218,14 +218,14 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: N(:) INTEGER(KIND=JPIM) :: K(:) REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(*) :: AARRAY + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: OFFSETA(:) REAL(KIND=JPRM), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(*) :: CARRAY + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 2b774a6bd..5f24a2329 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -56,6 +56,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU +USE TPM_TRANS, ONLY: PREEL_PTR USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT @@ -77,9 +78,9 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) REAL(KIND=JPRB), INTENT(OUT), ALLOCATABLE :: FOUBUF_IN(:) ! LOCAL -REAL(KIND=JPRBT), ALLOCATABLE :: ZINP(:), ZOUTS(:), ZOUTA(:) +REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINP(:), ZOUTS(:), ZOUTA(:) +REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUT0(:) REAL(KIND=JPRBT) :: ZAOA, ZSOA -REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 INTEGER(KIND=JPIM) :: KFIELDS @@ -87,6 +88,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 +INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS INTEGER(KIND=JPIM) :: A = 8 !Alignment @@ -113,14 +115,46 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) IOUT0_STRIDES0 = ALIGN(KFIELDS/2,A) IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R_NDGNH,A) -ALLOCATE(ZINP(IIN_STRIDES1*D_NUMP)) -ALLOCATE(ZOUTS(IOUT_STRIDES1*D_NUMP)) -ALLOCATE(ZOUTA(IOUT_STRIDES1*D_NUMP)) -ALLOCATE(ZINP0(IIN0_STRIDES1*1)) -ALLOCATE(ZOUT0(IOUT0_STRIDES1*1)) +! Check if the reuse buffer is large enough +ALLOC_SZ = ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINP(1)) & + +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTS(1)) & + +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTA(1)) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & + +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) +IF (.NOT. ALLOCATED(PREEL_PTR)) THEN + ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ELSEIF (SIZEOF(PREEL_PTR) <= ALLOC_SZ) THEN + ! and reallocate if needed + !$ACC EXIT DATA DELETE(PREEL_PTR) + DEALLOCATE(PREEL_PTR) + ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ENDIF + +! Figure out which pointers to use +ALLOC_POS=1 +CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE, & + & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) + +ZINP(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) +ZOUTS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) +ZOUTA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) + +! The BASE0 pointer points to the rest, but likely in a different type! +CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) +ALLOC_POS=1 +ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) +ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & -!$ACC& CREATE (ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & +!$ACC& COPYIN(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) @@ -369,11 +403,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO !$ACC END DATA - !$ACC END DATA -DEALLOCATE(ZINP) -DEALLOCATE(ZOUTS) -DEALLOCATE(ZOUTA) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From e0008eda7aaf37fafa2076a5883eaeb556077756 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:29 -0700 Subject: [PATCH 199/263] re-use buffers for ledir --- src/trans/gpu/internal/ledir_mod.F90 | 54 +++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 8ecb38cb6..37fa93c4c 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -60,6 +60,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 +USE TPM_TRANS, ONLY: PREEL_PTR USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -86,13 +87,14 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 -REAL(KIND=JPRBT), ALLOCATABLE :: ZINPS(:), ZINPA(:), ZOUT(:) -REAL(KIND=JPRD), ALLOCATABLE :: ZINP0(:), ZOUT0(:) +REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINPS(:), ZINPA(:), ZOUT(:) +REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUT0(:) INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 +INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS INTEGER(KIND=JPIM) :: A = 8 !Alignment @@ -107,11 +109,44 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IIN0_STRIDES0 = ALIGN(KF_FS,A) IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,A) -ALLOCATE(ZINPA(IIN_STRIDES1*D_NUMP)) -ALLOCATE(ZINPS(IIN_STRIDES1*D_NUMP)) -ALLOCATE(ZOUT(IOUT_STRIDES1*D_NUMP)) -ALLOCATE(ZINP0(IIN0_STRIDES1*1)) -ALLOCATE(ZOUT0(IOUT0_STRIDES1*1)) +! Check if the reuse buffer is large enough +ALLOC_SZ = ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPS(1)) & + +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPA(1)) & + +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUT(1)) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & + +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) + +IF (.NOT. ALLOCATED(PREEL_PTR)) THEN + ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ELSEIF (SIZEOF(PREEL_PTR) <= ALLOC_SZ) THEN + ! and reallocate if needed + !$ACC EXIT DATA DELETE(PREEL_PTR) + DEALLOCATE(PREEL_PTR) + ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) + !$ACC ENTER DATA CREATE(PREEL_PTR) +ENDIF + +! Figure out which pointers to use +ALLOC_POS=1 +CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE, & + & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) + +ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) +ZINPA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) +ZOUT(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) + +! The BASE0 pointer points to the rest, but likely in a different type! +CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) +ALLOC_POS=1 +ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) +ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA & !$ACC& CREATE(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & @@ -338,11 +373,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF !$ACC END DATA -DEALLOCATE(ZINPA) -DEALLOCATE(ZINPS) -DEALLOCATE(ZOUT) -DEALLOCATE(ZINP0) -DEALLOCATE(ZOUT0) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) From 832dce183244e66368b4369f337b0f3e1b593034 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:29 -0700 Subject: [PATCH 200/263] Rename reuse pointer --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 20 +++++++++--------- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 20 +++++++++--------- src/trans/gpu/internal/ledir_mod.F90 | 26 ++++++++++++------------ src/trans/gpu/internal/leinv_mod.F90 | 26 ++++++++++++------------ src/trans/gpu/internal/tpm_trans.F90 | 11 ++++++---- 5 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index ddbe4f473..cfe456bdf 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -62,7 +62,7 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & !USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT -USE TPM_TRANS, ONLY: PREEL_PTR +USE TPM_TRANS, ONLY: REUSE_PTR USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE USE FTDIR_MOD ,ONLY : FTDIR @@ -151,16 +151,16 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF -IF (.NOT. ALLOCATED(PREEL_PTR)) THEN - ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(PREEL_PTR) -ELSEIF (SIZE(PREEL_PTR) < KF_FS*D%NLENGTF) THEN - !$ACC EXIT DATA DELETE(PREEL_PTR) - DEALLOCATE(PREEL_PTR) - ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(PREEL_PTR) +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) ENDIF -PREEL_REAL => PREEL_PTR +PREEL_REAL => REUSE_PTR ! Transposition diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 index ccdd218ed..cd96670b3 100755 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctl_mod.F90 @@ -62,7 +62,7 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& USE TPM_GEN ,ONLY : NERR, nout !USE TPM_DIM !USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, PREEL_PTR +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, REUSE_PTR USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TPM_FLT ,ONLY : S USE FOURIER_IN_MOD ,ONLY : FOURIER_IN @@ -131,16 +131,16 @@ SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives KF_FS = IFIRST -IF (.NOT. ALLOCATED(PREEL_PTR)) THEN - ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(PREEL_PTR) -ELSEIF (SIZE(PREEL_PTR) < KF_FS*D%NLENGTF) THEN - !$ACC EXIT DATA DELETE(PREEL_PTR) - DEALLOCATE(PREEL_PTR) - ALLOCATE(PREEL_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(PREEL_PTR) +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) ENDIF -PREEL_COMPLEX => PREEL_PTR +PREEL_COMPLEX => REUSE_PTR ! Initialize potentially unset offsets KSCALARS_NSDER_OFFSET = -1 diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 37fa93c4c..8ecc43f13 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -60,7 +60,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 -USE TPM_TRANS, ONLY: PREEL_PTR +USE TPM_TRANS, ONLY: REUSE_PTR USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER @@ -116,21 +116,21 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) -IF (.NOT. ALLOCATED(PREEL_PTR)) THEN - ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) - !$ACC ENTER DATA CREATE(PREEL_PTR) -ELSEIF (SIZEOF(PREEL_PTR) <= ALLOC_SZ) THEN +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZEOF(REUSE_PTR) <= ALLOC_SZ) THEN ! and reallocate if needed - !$ACC EXIT DATA DELETE(PREEL_PTR) - DEALLOCATE(PREEL_PTR) - ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) - !$ACC ENTER DATA CREATE(PREEL_PTR) + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) ENDIF ! Figure out which pointers to use ALLOC_POS=1 -CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE, & - & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) +CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE, & + & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) @@ -140,8 +140,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) ! The BASE0 pointer points to the rest, but likely in a different type! -CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) +CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) ALLOC_POS=1 ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 5f24a2329..febc018d3 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -56,7 +56,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_TRANS, ONLY: PREEL_PTR +USE TPM_TRANS, ONLY: REUSE_PTR USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE TPM_GEN, ONLY: NOUT @@ -121,21 +121,21 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTA(1)) & +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) -IF (.NOT. ALLOCATED(PREEL_PTR)) THEN - ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) - !$ACC ENTER DATA CREATE(PREEL_PTR) -ELSEIF (SIZEOF(PREEL_PTR) <= ALLOC_SZ) THEN +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZEOF(REUSE_PTR) <= ALLOC_SZ) THEN ! and reallocate if needed - !$ACC EXIT DATA DELETE(PREEL_PTR) - DEALLOCATE(PREEL_PTR) - ALLOCATE(PREEL_PTR(ALLOC_SZ/SIZEOF(PREEL_PTR(1)))) - !$ACC ENTER DATA CREATE(PREEL_PTR) + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) ENDIF ! Figure out which pointers to use ALLOC_POS=1 -CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE, & - & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) +CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE, & + & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) ZINP(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) @@ -145,8 +145,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) ! The BASE0 pointer points to the rest, but likely in a different type! -CALL C_F_POINTER(C_LOC(PREEL_PTR(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(PREEL_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) +CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) ALLOC_POS=1 ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 50d028c7d..158f7680a 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -63,9 +63,12 @@ MODULE TPM_TRANS REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) -! This is used in fourier space. It's reused among the transforms because -! we cannot reallocate - the captured CUDA graphs should not be modified. -! Hence, we keep it if it is large enough, otherwise we adapt the size. -REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: PREEL_PTR(:) +! This is used in fourier space and in spectral space. It's reused among +! the transforms because we cannot reallocate - the captured CUDA graphs +! should not be modified. Hence, we keep it if it is large enough, otherwise +! we adapt the size. After 2 iterations this should lead to constant runtimes +! (the first iteration is used to get the max buffer size, the second iteration +! is going to recreate the graphs if needed) +REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: REUSE_PTR(:) END MODULE TPM_TRANS From 8ecc13d896ebfe2a162040a279937c6e84128c0a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:29 -0700 Subject: [PATCH 201/263] Add option to use openacc streams --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 70 +++++++++++-------- .../gpu/internal/cuda_gemm_batched_mod.F90 | 61 +++++++++------- src/trans/gpu/internal/ledir_mod.F90 | 11 +-- src/trans/gpu/internal/leinv_mod.F90 | 11 +-- 4 files changed, 86 insertions(+), 67 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 7678f69a4..7fc663948 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -1,3 +1,4 @@ +#include #include #include @@ -52,7 +53,8 @@ template void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, int blas_id = -1) { + int *offsetsC, int batchCount, cudaStream_t stream, + int blas_id = -1) { // we store at most one graph per "m" (# fields) and "blas id" static std::unordered_map, cudaGraphExec_t, detail::pair_hash> @@ -108,7 +110,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, ptrCache.insert({key, std::make_tuple(A, B, C)}); } - CUDA_CHECK(cudaGraphLaunch(graphCache.at(key), 0)); + CUDA_CHECK(cudaGraphLaunch(graphCache.at(key), stream)); } // stupid simple gemm calls @@ -116,11 +118,11 @@ template void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, Real beta, Real *C, int ldc, int *offsetsC, int batchCount, - int = -1) { + cudaStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; - gemm(0, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, - beta, C + offsetsC[i], ldc); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb, beta, C + offsetsC[i], ldc); } } @@ -231,34 +233,36 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, - int ldc, int *offsetsC, int batchCount) { + int ldc, int *offsetsC, int batchCount, + cudaStream_t stream) { using namespace detail; run_group_graph(cutlass_sgemm_grouped(), m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, blas_id); + batchCount, stream, blas_id); } void cutlass_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, - int *offsetsC, int batchCount) { + int *offsetsC, int batchCount, + cudaStream_t stream) { if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_N) cutlass_sgemm_wrapper_grouped_op( blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, - ldc, offsetsC, batchCount); + ldc, offsetsC, batchCount, stream); else if (transa == CUBLAS_OP_N && transb == CUBLAS_OP_T) cutlass_sgemm_wrapper_grouped_op( blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, - ldc, offsetsC, batchCount); + ldc, offsetsC, batchCount, stream); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_N) cutlass_sgemm_wrapper_grouped_op( blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, - ldc, offsetsC, batchCount); + ldc, offsetsC, batchCount, stream); else if (transa == CUBLAS_OP_T && transb == CUBLAS_OP_T) cutlass_sgemm_wrapper_grouped_op( blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, - ldc, offsetsC, batchCount); + ldc, offsetsC, batchCount, stream); else assert(false); } @@ -301,11 +305,12 @@ void cublas_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, - int *offsetsC, int batchCount) { + int *offsetsC, int batchCount, + cudaStream_t stream) { using namespace detail; run_group_graph(cublas_gemm_grouped(transa, transb), m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, blas_id); + batchCount, stream, blas_id); } void cublas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, @@ -313,12 +318,13 @@ void cublas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, int lda, int *offsetsA, const double *B, int ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, - int batchCount) { + int batchCount, cudaStream_t stream) { using namespace detail; run_group_graph(cublas_gemm_grouped(transa, transb), m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, blas_id); + batchCount, stream, blas_id); } + } // namespace extern "C" { @@ -326,10 +332,10 @@ void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double alpha, const double *A, int lda, int tda, const double *B, int ldb, int tdb, double beta, double *C, int ldc, int tdc, - int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); - + int batchCount, size_t stream) { + cublasHandle_t handle = detail::get_cublas_handle(); + CUBLAS_CHECK( + cublasSetStream(handle, *(cudaStream_t*)stream)); CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); @@ -339,10 +345,10 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float alpha, const float *A, int lda, int tda, const float *B, int ldb, int tdb, float beta, float *C, int ldc, int tdc, - int batchCount) { - static cublasHandle_t handle = nullptr; - if (!handle) CUBLAS_CHECK(cublasCreate(&handle)); - + int batchCount, size_t stream) { + cublasHandle_t handle = detail::get_cublas_handle(); + CUBLAS_CHECK( + cublasSetStream(handle, *(cudaStream_t*)stream)); CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); @@ -353,24 +359,26 @@ void blas_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, int ldc, - int *offsetsC, int batchCount) { + int *offsetsC, int batchCount, size_t stream) { if (use_cutlass) cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + offsetsC, batchCount, + *(cudaStream_t*)stream); else cublas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount); + offsetsC, batchCount, + *(cudaStream_t*)stream); } void blas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, double alpha, const double *A, int lda, int *offsetsA, const double *B, int ldb, int *offsetsB, double beta, double *C, int ldc, - int *offsetsC, int batchCount) { - cublas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, - B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount); + int *offsetsC, int batchCount, size_t stream) { + cublas_dgemm_wrapper_grouped( + blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, *(cudaStream_t*)stream); } } diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 1782d7fda..ab7ec73a7 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -2,6 +2,7 @@ MODULE CUDA_GEMM_BATCHED_MOD USE PARKIND1, ONLY: JPRD, JPRM, JPIM USE CUBLAS, ONLY: CUBLAS_OP_N, CUBLAS_OP_T USE ISO_C_BINDING + USE OPENACC, ONLY: ACC_GET_CUDA_STREAM IMPLICIT NONE @@ -25,13 +26,30 @@ SUBROUTINE CUDA_SGEMM_BATCHED(& & B, LDB, TDB, & & BETA, & & C, LDC, TDC, & - & BATCHCOUNT & + & BATCHCOUNT, STREAM & &) BIND(C, NAME='cublas_sgemm_wrapper') USE ISO_C_BINDING INTEGER(C_INT), VALUE :: CTA, CTB, M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) + INTEGER(KIND=c_size_t) :: STREAM END SUBROUTINE CUDA_SGEMM_BATCHED + SUBROUTINE CUDA_DGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT, STREAM & + &) BIND(C, NAME='cublas_dgemm_wrapper') + USE ISO_C_BINDING + INTEGER(C_INT), VALUE :: CTA, CTB, M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE) :: A(*), B(*), C(*) + INTEGER(KIND=c_size_t) :: STREAM + END SUBROUTINE CUDA_DGEMM_BATCHED SUBROUTINE CUDA_DGEMM_GROUPED(& & BLAS_ID, CTA, CTB, & & M, N, K, & @@ -40,12 +58,13 @@ SUBROUTINE CUDA_DGEMM_GROUPED(& & B, LDB, OFFSETB, & & BETA, & & C, LDC, OFFSETC, & - & BATCHCOUNT & + & BATCHCOUNT, STREAM & &) BIND(C, NAME='blas_dgemm_wrapper_grouped') USE ISO_C_BINDING INTEGER(C_INT), VALUE :: BLAS_ID, CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) + INTEGER(KIND=c_size_t) :: STREAM END SUBROUTINE CUDA_DGEMM_GROUPED SUBROUTINE CUDA_SGEMM_GROUPED(& & BLAS_ID, CTA, CTB, & @@ -55,28 +74,14 @@ SUBROUTINE CUDA_SGEMM_GROUPED(& & B, LDB, OFFSETB, & & BETA, & & C, LDC, OFFSETC, & - & BATCHCOUNT & + & BATCHCOUNT, STREAM & &) BIND(C, NAME='blas_sgemm_wrapper_grouped') USE ISO_C_BINDING INTEGER(C_INT), VALUE :: BLAS_ID, CTA, CTB, M, N(:), K(:), LDA, LDB, LDC, OFFSETA(:), OFFSETB(:), OFFSETC(:), BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) + INTEGER(KIND=c_size_t) :: STREAM END SUBROUTINE CUDA_SGEMM_GROUPED - SUBROUTINE CUDA_DGEMM_BATCHED(& - & CTA, CTB, & - & M, N, K, & - & ALPHA, & - & A, LDA, TDA, & - & B, LDB, TDB, & - & BETA, & - & C, LDC, TDC, & - & BATCHCOUNT & - &) BIND(C, NAME='cublas_dgemm_wrapper') - USE ISO_C_BINDING - INTEGER(C_INT), VALUE :: CTA, CTB, M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT - REAL(C_DOUBLE), VALUE :: ALPHA,BETA - REAL(C_DOUBLE) :: A(*), B(*), C(*) - END SUBROUTINE CUDA_DGEMM_BATCHED END INTERFACE CONTAINS @@ -89,7 +94,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & & BARRAY, LDB, STRIDEB, & & BETA, & & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) + & BATCHCOUNT, STREAM) INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB INTEGER(KIND=JPIM) :: M @@ -107,6 +112,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_LONG) :: STREAM !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_DGEMM_BATCHED( & @@ -117,7 +123,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & & BARRAY, LDB, STRIDEB, & & BETA, & & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) + & BATCHCOUNT, ACC_GET_CUDA_STREAM(STREAM)) !$ACC END HOST_DATA END SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD @@ -129,7 +135,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & & BARRAY, LDB, STRIDEB, & & BETA, & & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) + & BATCHCOUNT, STREAM) INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB INTEGER(KIND=JPIM) :: M @@ -147,6 +153,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_LONG) :: STREAM !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_SGEMM_BATCHED( & @@ -157,7 +164,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & & BARRAY, LDB, STRIDEB, & & BETA, & & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT) + & BATCHCOUNT, ACC_GET_CUDA_STREAM(STREAM)) !$ACC END HOST_DATA END SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD @@ -169,7 +176,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & & BARRAY, LDB, OFFSETB, & & BETA, & & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT) + & BATCHCOUNT, STREAM) INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB @@ -188,6 +195,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_LONG) :: STREAM !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_DGEMM_GROUPED( & @@ -198,7 +206,7 @@ SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD( & & BARRAY, LDB, OFFSETB, & & BETA, & & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT) + & BATCHCOUNT, ACC_GET_CUDA_STREAM(STREAM)) !$ACC END HOST_DATA END SUBROUTINE CUDA_DGEMM_GROUPED_OVERLOAD @@ -210,7 +218,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & & BARRAY, LDB, OFFSETB, & & BETA, & & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT) + & BATCHCOUNT, STREAM) INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID INTEGER(KIND=C_INT), INTENT(IN) :: TRANSA INTEGER(KIND=C_INT), INTENT(IN) :: TRANSB @@ -229,6 +237,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT + INTEGER(KIND=C_LONG) :: STREAM !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) CALL CUDA_SGEMM_GROUPED( & @@ -239,7 +248,7 @@ SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD( & & BARRAY, LDB, OFFSETB, & & BETA, & & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT) + & BATCHCOUNT, ACC_GET_CUDA_STREAM(STREAM)) !$ACC END HOST_DATA END SUBROUTINE CUDA_SGEMM_GROUPED_OVERLOAD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 8ecc43f13..334e1bdab 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -67,6 +67,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC +USE OPENACC IMPLICIT NONE @@ -149,7 +150,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA & -!$ACC& CREATE(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & +!$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,POA1) & @@ -223,7 +224,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP) + & D_NUMP, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -271,7 +272,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1) + & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) @@ -315,7 +316,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP) + & D_NUMP, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(434,0) @@ -359,7 +360,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1) + & 1, STREAM=ACC_ASYNC_SYNC) call cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index febc018d3..ded1f3408 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -64,6 +64,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER USE IEEE_ARITHMETIC +USE OPENACC USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE @@ -154,7 +155,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & -!$ACC& COPYIN(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & +!$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) @@ -228,7 +229,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & - & D_NUMP) + & D_NUMP, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN @@ -256,7 +257,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1) + & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) @@ -329,7 +330,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & - & D_NUMP) + & D_NUMP, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() IF (LSYNC_TRANS) THEN CALL GSTATS(444,0) @@ -355,7 +356,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1) + & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) From 69f12ce7686ecd5fdcd66d7cc2ee4d963f3caeb6 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:30 -0700 Subject: [PATCH 202/263] Add ZINPS0/ZINPA0 to have same semantics in ledir --- src/trans/gpu/internal/ledir_mod.F90 | 46 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 334e1bdab..9f67628de 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -89,7 +89,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINPS(:), ZINPA(:), ZOUT(:) -REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUT0(:) +REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINPS0(:), ZINPA0(:), ZOUT0(:) INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 @@ -114,7 +114,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ALLOC_SZ = ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPS(1)) & +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPA(1)) & +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUT(1)) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINPS0(1)) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINPA0(1)) & +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) IF (.NOT. ALLOCATED(REUSE_PTR)) THEN @@ -144,13 +145,15 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE0, & & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) ALLOC_POS=1 -ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) +ZINPS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) +ZINPA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA & -!$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINP0,ZOUT0) & +!$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(ZAA,ZAS,POA1) & @@ -188,6 +191,20 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDDO END DO !$ACC END DATA +! compute m=0 in double precision: +IF(KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' + + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JF=1,2*KF_FS,2 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & + & = ZINPA((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & + & = ZINPS((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) + ENDDO + ENDDO +ENDIF !$ACC EXIT DATA DELETE(FOUBUF) DEALLOCATE(FOUBUF) @@ -251,15 +268,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF(KMLOC0 > 0) THEN print*,'computing m=0 in double precision' - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & - & = ZINPA((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) - ENDDO - ENDDO - - ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -268,7 +276,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & + & ZINPA0, IIN0_STRIDES0, 0, & & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & @@ -340,14 +348,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDDO IF(KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JF=1,2*KF_FS,2 - ZINP0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & - & = ZINPS((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) - ENDDO - ENDDO - ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -356,7 +356,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & + & ZINPS0, IIN0_STRIDES0, 0, & & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & From 6c28dd6cfb8d9dc782127321b7f985fef309e654 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:30 -0700 Subject: [PATCH 203/263] Merge kernels for asymm/ledir --- src/trans/gpu/internal/ledir_mod.F90 | 64 ++++++++++++---------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 9f67628de..c764d0367 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -184,27 +184,19 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) PAIA = PAIA*F%RACTHE(JGL) PAIS = PAIS*F%RACTHE(JGL) ENDIF - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_striDes0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF ENDIF ENDDO ENDDO END DO !$ACC END DATA -! compute m=0 in double precision: -IF(KMLOC0 > 0) THEN - print*,'computing m=0 in double precision' - - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JF=1,2*KF_FS,2 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & - & = ZINPA((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0) & - & = ZINPS((JF-1)+1+(JGL-1)*IIN_STRIDES0+(KMLOC0-1)*IIN_STRIDES1) - ENDDO - ENDDO -ENDIF !$ACC EXIT DATA DELETE(FOUBUF) DEALLOCATE(FOUBUF) @@ -250,20 +242,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF CALL GSTATS(414,1) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO JK=1,2*KF_FS - KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - IA = 1+MOD(R_NTMAX-KM+2,2) - !$ACC LOOP SEQ - DO J=1,(R%NSMAX-KM+2)/2 - POA1(JK,IA+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ENDDO - ENDIF - ENDDO -ENDDO - ! compute m=0 in double precision: IF(KMLOC0 > 0) THEN print*,'computing m=0 in double precision' @@ -283,15 +261,27 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NSMAX+2)/2 - DO JK=1,2*KF_FS,2 - IA = 1+MOD(R_NTMAX+2,2) - POA1(JK,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*IOUT0_STRIDES0) - ENDDO - ENDDO ENDIF +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) +DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NTMAX-KM+2,2) + IF (KM /= 0) THEN + !$ACC LOOP SEQ + DO J=1,(R%NSMAX-KM+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO +ENDDO + ! symmetric IF (LSYNC_TRANS) THEN From 29e1a458e3b3b1b2fc067a7997077c837580d737 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:30 -0700 Subject: [PATCH 204/263] Merge kernels for symm/ledir --- src/trans/gpu/internal/ledir_mod.F90 | 42 +++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c764d0367..6a38d2d14 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -81,7 +81,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! LOCAL VARIABLES INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM) :: IA, IS, ISL, J, JK +INTEGER(KIND=JPIM) :: IA, IS, ISL, J INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS @@ -323,20 +323,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF CALL GSTATS(414,1) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) -DO KMLOC=1,D_NUMP - DO JK=1,2*KF_FS - KM = D_MYMS(KMLOC) - IF (KM /= 0) THEN - IS = 1+MOD(R_NTMAX-KM+1,2) - !$ACC LOOP SEQ - DO J=1,(R_NSMAX-KM+3)/2 - POA1(JK,IS+1+(J-1)*2,KMLOC) = ZOUT(JK+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ENDDO - ENDIF - ENDDO -ENDDO - IF(KMLOC0 > 0) THEN ! Get C in transpose format to get better memory access patterns later !C=A*B => @@ -353,16 +339,28 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & 1, STREAM=ACC_ASYNC_SYNC) call cudaDeviceSynchronize() - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IS) DEFAULT(NONE) - DO J=1,(R_NSMAX+3)/2 - DO JK=1,2*KF_FS,2 - IS = 1+MOD(R_NTMAX+1,2) - POA1(JK,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JK-1)/2+1+(J-1)*IOUT0_STRIDES0) - ENDDO - ENDDO ENDIF +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) +DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NTMAX-KM+1,2) + IF (KM /= 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO +ENDDO + !$ACC END DATA From 48962f0cf75ca9a32e90e285b39d78ffe77cba38 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:30 -0700 Subject: [PATCH 205/263] Run DGEMMs before SGEMMs in ledir --- src/trans/gpu/internal/ledir_mod.F90 | 68 ++++++++++++---------------- 1 file changed, 28 insertions(+), 40 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 6a38d2d14..99ad135a6 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -209,6 +209,20 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) CALL GSTATS(430,1) ENDIF CALL GSTATS(414,0) + +IF(KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' + + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & + & 1.0_JPRD, & + & ZINPA0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=ACC_ASYNC_SYNC) +ENDIF ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -242,27 +256,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF CALL GSTATS(414,1) -! compute m=0 in double precision: -IF(KMLOC0 > 0) THEN - print*,'computing m=0 in double precision' - - ! Get C in transpose format to get better memory access patterns later - !C=A*B => - ! C^T=B^T*A^T - - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & - & 1.0_JPRD, & - & ZINPA0, IIN0_STRIDES0, 0, & - & ZAA0, SIZE(ZAA0,1), 0, & - & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - CALL cudaDeviceSynchronize() - -ENDIF - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JF=1,2*KF_FS @@ -290,6 +283,20 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) CALL GSTATS(430,1) ENDIF CALL GSTATS(414,0) + +IF(KMLOC0 > 0) THEN + ! compute m=0 in double precision: + call CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & + & 1.0_JPRD, & + & ZINPS0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=ACC_ASYNC_SYNC) +ENDIF + ! Get C in transpose format to get better memory access patterns later !C=A*B => ! C^T=B^T*A^T @@ -323,25 +330,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF CALL GSTATS(414,1) -IF(KMLOC0 > 0) THEN - ! Get C in transpose format to get better memory access patterns later - !C=A*B => - ! C^T=B^T*A^T - - call CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & - & 1.0_JPRD, & - & ZINPS0, IIN0_STRIDES0, 0, & - & ZAS0, SIZE(ZAS0,1), 0, & - & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - call cudaDeviceSynchronize() - - -ENDIF - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) DO KMLOC=1,D_NUMP DO JF=1,2*KF_FS From fc419afdbd71e8388dce996f080a9cc176410d67 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:30 -0700 Subject: [PATCH 206/263] add async statements in ledir --- src/trans/gpu/internal/ledir_mod.F90 | 33 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 99ad135a6..390b91af4 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -166,7 +166,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC END KERNELS !$ACC DATA PRESENT(FOUBUF) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) +!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH DO JF=1,KF_FS*2 @@ -198,12 +198,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) END DO !$ACC END DATA -!$ACC EXIT DATA DELETE(FOUBUF) -DEALLOCATE(FOUBUF) - ! anti-symmetric +IF(KMLOC0 > 0) THEN + PRINT*,'computing m=0 in double precision' +ENDIF IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(430,1) @@ -211,8 +212,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) CALL GSTATS(414,0) IF(KMLOC0 > 0) THEN - print*,'computing m=0 in double precision' - + ! compute m=0 in double precision: CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_N, & & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & @@ -221,7 +221,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) + & 1, STREAM=1_C_LONG) ENDIF ! Get C in transpose format to get better memory access patterns later !C=A*B => @@ -247,16 +247,16 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=ACC_ASYNC_SYNC) -CALL cudaDeviceSynchronize() + & D_NUMP, STREAM=1_C_LONG) IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(434,1) ENDIF CALL GSTATS(414,1) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) DO KMLOC=1,D_NUMP DO JF=1,2*KF_FS KM = D_MYMS(KMLOC) @@ -278,6 +278,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! symmetric IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(430,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(430,1) @@ -294,7 +295,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) + & 1, STREAM=1_C_LONG) ENDIF ! Get C in transpose format to get better memory access patterns later @@ -321,16 +322,16 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=ACC_ASYNC_SYNC) -CALL cudaDeviceSynchronize() + & D_NUMP, STREAM=1_C_LONG) IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(434,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(434,1) ENDIF CALL GSTATS(414,1) -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) ASYNC(1) DO KMLOC=1,D_NUMP DO JF=1,2*KF_FS KM = D_MYMS(KMLOC) @@ -348,9 +349,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF ENDDO ENDDO +!$ACC WAIT(1) !$ACC END DATA +!$ACC EXIT DATA DELETE(FOUBUF) +DEALLOCATE(FOUBUF) + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From a69c969d67e70de45adbe68cdc215856b3a53534 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:31 -0700 Subject: [PATCH 207/263] Add ZOUTS0/ZOUTA0 to have same semantics in leinv --- src/trans/gpu/internal/leinv_mod.F90 | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index ded1f3408..b4f10b5be 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -80,7 +80,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! LOCAL REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINP(:), ZOUTS(:), ZOUTA(:) -REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUT0(:) +REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUTS0(:), ZOUTA0(:) REAL(KIND=JPRBT) :: ZAOA, ZSOA INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 @@ -121,7 +121,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTS(1)) & +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTA(1)) & +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & - +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) + +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUTS0(1)) & + +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUTA0(1)) IF (.NOT. ALLOCATED(REUSE_PTR)) THEN ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) !$ACC ENTER DATA CREATE(REUSE_PTR) @@ -151,11 +152,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOC_POS=1 ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) -ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) +ZOUTS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) +ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) +ZOUTA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & -!$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUT0) & +!$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) @@ -256,7 +259,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZINP0, IIN0_STRIDES0, 0, & & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & + & ZOUTA0, IOUT0_STRIDES0, 0, & & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() @@ -265,7 +268,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO JK=1,KFIELDS,2 ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & - & = ZOUT0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + & = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ENDDO ENDDO @@ -355,7 +358,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZINP0, IIN0_STRIDES0, 0, & & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & + & ZOUTS0, IOUT0_STRIDES0, 0, & & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() @@ -363,7 +366,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO JGL=1,G_NDGLU(0) DO JK=1,KFIELDS,2 ZOUTS((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & - & = ZOUT0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + & = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ENDDO ENDDO From 039606230e55e850627871c91ff8803d241966b8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:31 -0700 Subject: [PATCH 208/263] Move around kernels in leinv --- src/trans/gpu/internal/leinv_mod.F90 | 60 +++++++++++++++------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index b4f10b5be..23b0b98b7 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -198,6 +198,15 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF ENDDO ENDDO +IF (KMLOC0 > 0) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) + DO J=1,(R_NSMAX+2)/2 + DO JK=1,KFIELDS,2 + IA = 1+MOD(R_NSMAX+2,2) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC0) + ENDDO + ENDDO +ENDIF ! operate on full arrays, where non-relavent entries have been set to zero ! Get C in transpose format to get better memory access patterns later @@ -244,13 +253,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) IF (KMLOC0 > 0) THEN print*,'computing m=0 in double precision' - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NSMAX+2)/2 - DO JK=1,KFIELDS,2 - IA = 1+MOD(R_NSMAX+2,2) - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC0) - ENDDO - ENDDO CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & @@ -263,15 +265,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JK=1,KFIELDS,2 - ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) - ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & - & = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ENDDO - ENDDO - ENDIF ! 2. +++++++++++++ symmetric @@ -302,6 +295,16 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF ENDDO ENDDO +IF (KMLOC0 > 0) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) + DO J=1,(R_NSMAX+3)/2 + DO JK=1,KFIELDS,2 + IS = 1+MOD(R_NSMAX+1,2) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC0) + ENDDO + ENDDO +ENDIF + !C=A*B => ! C^T=B^T*A^T @@ -343,14 +346,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(424,1) IF (KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NSMAX+3)/2 - DO JK=1,KFIELDS,2 - IS = 1+MOD(R_NSMAX+1,2) - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC0) - ENDDO - ENDDO - CALL CUDA_GEMM_BATCHED( & & CUBLAS_OP_N, CUBLAS_OP_T, & & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & @@ -361,7 +356,20 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZOUTS0, IOUT0_STRIDES0, 0, & & 1, STREAM=ACC_ASYNC_SYNC) CALL cudaDeviceSynchronize() +ENDIF + +ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) +!$ACC ENTER DATA CREATE(FOUBUF_IN) +IF (KMLOC0 > 0) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) + DO JGL=1,G_NDGLU(0) + DO JK=1,KFIELDS,2 + ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) + ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & + & = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ENDDO + ENDDO !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JK=1,KFIELDS,2 @@ -371,10 +379,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDIF - -ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) -!$ACC ENTER DATA CREATE(FOUBUF_IN) - !$ACC DATA PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP From 08a53cc050b1de549d6c53460d7aefdee649b32d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:31 -0700 Subject: [PATCH 209/263] Merge input kernels in leinv --- src/trans/gpu/internal/ledir_mod.F90 | 4 ++-- src/trans/gpu/internal/leinv_mod.F90 | 35 +++++----------------------- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 390b91af4..1b5d94c72 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -269,7 +269,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - POA1(JF,IA+1+(J-1)*2,KMLOC0) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) ENDDO ENDIF ENDDO @@ -344,7 +344,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - POA1(JF,IS+1+(J-1)*2,KMLOC0) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) ENDDO ENDIF ENDDO diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 23b0b98b7..2b8268375 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -190,23 +190,15 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO J=1,(R_NSMAX-KM+2)/2 ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO - ELSEIF (MOD((JK+1),2) .EQ. 0) THEN + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 !$ACC LOOP SEQ DO J=1,(R_NSMAX+2)/2 - ZINP((JK-1)/2+1+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO ENDDO -IF (KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NSMAX+2)/2 - DO JK=1,KFIELDS,2 - IA = 1+MOD(R_NSMAX+2,2) - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC0) - ENDDO - ENDDO -ENDIF ! operate on full arrays, where non-relavent entries have been set to zero ! Get C in transpose format to get better memory access patterns later @@ -282,28 +274,19 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) DO JK=1,KFIELDS KM = D_MYMS(KMLOC) IS = 1+MOD(R_NSMAX-KM+1,2) - IF(KM /= 0)THEN + IF(KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO - ELSEIF (MOD((JK+1),2) .EQ. 0) THEN + ELSEIF (MOD((JK-1),2) == 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX+3)/2 - ZINP((JK-1)/2+1+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ENDIF ENDDO ENDDO -IF (KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(IA) DEFAULT(NONE) - DO J=1,(R_NSMAX+3)/2 - DO JK=1,KFIELDS,2 - IS = 1+MOD(R_NSMAX+1,2) - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC0) - ENDDO - ENDDO -ENDIF !C=A*B => @@ -365,14 +348,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) DO JGL=1,G_NDGLU(0) DO JK=1,KFIELDS,2 - ! ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL+(KMLOC-1)*R_NDGNH)*KFIELDS) ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & & = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ENDDO - ENDDO - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JK=1,KFIELDS,2 ZOUTS((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & & = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ENDDO From 715a1d6e0c4de3ac4183a02057798ba08f950a91 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:31 -0700 Subject: [PATCH 210/263] Merge output kernels in leinv --- src/trans/gpu/internal/leinv_mod.F90 | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 2b8268375..b13f4a4d5 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -344,18 +344,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) !$ACC ENTER DATA CREATE(FOUBUF_IN) -IF (KMLOC0 > 0) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO JGL=1,G_NDGLU(0) - DO JK=1,KFIELDS,2 - ZOUTA((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & - & = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ZOUTS((JK-1)/2+1+(JGL-1)*IOUT_STRIDES0+(KMLOC0-1)*IOUT_STRIDES1) & - & = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ENDDO - ENDDO - -ENDIF !$ACC DATA PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP @@ -373,8 +361,8 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS((JK-1)/2+1+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ZAOA = ZOUTA((JK-1)/2+1+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ELSE ! Imaginary values of KM=0 is zero, though I don't think we care ZSOA = 0_JPRBT From 3fd83ffeee9de5b1f2fa03567c14b032c4305c3f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:31 -0700 Subject: [PATCH 211/263] Run DGEMMs before SGEMMs in leinv --- src/trans/gpu/internal/leinv_mod.F90 | 78 +++++++++++++--------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index b13f4a4d5..dd41e31f6 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -169,6 +169,13 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ZOUTA(:) = 0 !$ACC END KERNELS +ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) +!$ACC ENTER DATA CREATE(FOUBUF_IN) + +IF (KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' +ENDIF + ! READ 2:NSMAX+3 !IF KM=0 and NSMAX is 6: @@ -200,18 +207,27 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO -! operate on full arrays, where non-relavent entries have been set to zero -! Get C in transpose format to get better memory access patterns later -!C=A*B => -! C^T=B^T*A^T - IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) -! OVERLOADED FOR SINGLE AND DOUBLE PRECISION + +IF (KMLOC0 > 0) THEN + ! compute m=0 in double precision: + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUTA0, IOUT0_STRIDES0, 0, & + & 1, STREAM=ACC_ASYNC_SYNC) + CALL cudaDeviceSynchronize() +ENDIF + DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+2)/2 @@ -243,22 +259,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,1) -IF (KMLOC0 > 0) THEN - print*,'computing m=0 in double precision' - - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & - & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & - & ZAA0, SIZE(ZAA0,1), 0, & - & 0.0_JPRD, & - & ZOUTA0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - CALL cudaDeviceSynchronize() - -ENDIF - ! 2. +++++++++++++ symmetric !IF KM=0 and NSMAX is 6: ! IS=2 @@ -288,16 +288,26 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO - -!C=A*B => -! C^T=B^T*A^T - IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) + +IF (KMLOC0 > 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUTS0, IOUT0_STRIDES0, 0, & + & 1, STREAM=ACC_ASYNC_SYNC) + CALL cudaDeviceSynchronize() +ENDIF + DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+3)/2 @@ -328,22 +338,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDIF CALL GSTATS(424,1) -IF (KMLOC0 > 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & - & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & - & ZAS0, SIZE(ZAS0,1), 0, & - & 0.0_JPRD, & - & ZOUTS0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - CALL cudaDeviceSynchronize() -ENDIF - -ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) -!$ACC ENTER DATA CREATE(FOUBUF_IN) - !$ACC DATA PRESENT(FOUBUF_IN) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) DO KMLOC=1,D_NUMP From 19d31f058c2e656d3370510234a8fdf81e5c11bb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:32 -0700 Subject: [PATCH 212/263] add async statements in leinv --- src/trans/gpu/internal/leinv_mod.F90 | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index dd41e31f6..4569a0565 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -187,7 +187,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! DO=1,7/2+1 ... 1..4 ! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) DO KMLOC=1,D_NUMP DO JK=1,KFIELDS KM = D_MYMS(KMLOC) @@ -208,6 +208,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(440,1) @@ -224,8 +225,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUTA0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - CALL cudaDeviceSynchronize() + & 1, STREAM=1_C_LONG) ENDIF DO KMLOC=1,D_NUMP @@ -249,10 +249,10 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAA, SIZE(ZAA,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=ACC_ASYNC_SYNC) -CALL cudaDeviceSynchronize() + & D_NUMP, STREAM=1_C_LONG) IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(444,1) @@ -269,7 +269,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ! DO=1,5 ! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) +!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) ASYNC(1) DO KMLOC=1,D_NUMP DO JK=1,KFIELDS KM = D_MYMS(KMLOC) @@ -289,6 +289,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(440,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(440,1) @@ -304,8 +305,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUTS0, IOUT0_STRIDES0, 0, & - & 1, STREAM=ACC_ASYNC_SYNC) - CALL cudaDeviceSynchronize() + & 1, STREAM=1_C_LONG) ENDIF DO KMLOC=1,D_NUMP @@ -329,9 +329,9 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) & ZAS, SIZE(ZAS,1), BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=ACC_ASYNC_SYNC) -CALL cudaDeviceSynchronize() + & D_NUMP, STREAM=1_C_LONG) IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) CALL GSTATS(444,0) CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(444,1) @@ -339,7 +339,7 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) CALL GSTATS(424,1) !$ACC DATA PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) +!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) ASYNC(1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH DO JK=1,KFIELDS @@ -369,6 +369,9 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) ENDDO ENDDO ENDDO + +!$ACC WAIT(1) + !$ACC END DATA !$ACC END DATA From 43d52948a9d9895050f2e0b00e33d1a128c64994 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:32 -0700 Subject: [PATCH 213/263] enable 3XTF32 on ampere --- .../gpu/algor/external/gemm/gemm_wrapper.cu | 79 +++++++++++++------ 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 7fc663948..e89a4257b 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -133,10 +133,14 @@ CutlassGemm &get_cutlass_handle() { } namespace detail { + +enum class CutlassType { cutlass_3xtf32, cutlass_fp32 }; + +template +class cutlass_sgemm_grouped; + template -class cutlass_sgemm_grouped { -#if 0 - // we will enable this later (this ifdefs did not work, so I am going to enable this properly ltaer) +class cutlass_sgemm_grouped { // this was verified using Ampere and uses 3XTF32 static constexpr int AlignmentA = 4; static constexpr int AlignmentB = 4; @@ -171,7 +175,27 @@ class cutlass_sgemm_grouped { MyOp // >; static constexpr int sz_align = 8; -#else + + public: + void operator()(cudaStream_t stream, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) const { + auto &gemm_op = get_cutlass_handle(); + CUTLASS_CHECK(gemm_op( + {// + {(m + sz_align - 1) / sz_align * sz_align, + (n + sz_align - 1) / sz_align * sz_align, + (k + sz_align - 1) / sz_align * sz_align}, + {const_cast(A), lda}, + {const_cast(B), ldb}, + {C, ldc}, + {C, ldc}, + {alpha, beta}}, + nullptr, stream)); + } +}; +template +class cutlass_sgemm_grouped { // this was verified using Volta and uses FP32 static constexpr int AlignmentA = 1; static constexpr int AlignmentB = 1; @@ -206,7 +230,6 @@ class cutlass_sgemm_grouped { MyOp // >; static constexpr int sz_align = 1; -#endif public: void operator()(cudaStream_t stream, int m, int n, int k, float alpha, @@ -236,9 +259,21 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, int ldc, int *offsetsC, int batchCount, cudaStream_t stream) { using namespace detail; - run_group_graph(cutlass_sgemm_grouped(), m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id); + int device; + CUDA_CHECK(cudaGetDevice(&device)); + int capability_major; + CUDA_CHECK(cudaDeviceGetAttribute(&capability_major, + cudaDevAttrComputeCapabilityMajor, device)); + if (capability_major >= 8) + run_group_graph(cutlass_sgemm_grouped(), + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, blas_id); + else + run_group_graph(cutlass_sgemm_grouped(), + m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + ldc, offsetsC, batchCount, stream, blas_id); } void cutlass_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, @@ -320,9 +355,9 @@ void cublas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, double *C, int ldc, int *offsetsC, int batchCount, cudaStream_t stream) { using namespace detail; - run_group_graph(cublas_gemm_grouped(transa, transb), m, n, k, alpha, - A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id); + run_group(cublas_gemm_grouped(transa, transb), m, n, k, alpha, A, lda, + offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, + stream, blas_id); } } // namespace @@ -334,8 +369,7 @@ void cublas_dgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, double beta, double *C, int ldc, int tdc, int batchCount, size_t stream) { cublasHandle_t handle = detail::get_cublas_handle(); - CUBLAS_CHECK( - cublasSetStream(handle, *(cudaStream_t*)stream)); + CUBLAS_CHECK(cublasSetStream(handle, *(cudaStream_t *)stream)); CUBLAS_CHECK(cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); @@ -347,8 +381,7 @@ void cublas_sgemm_wrapper(cublasOperation_t transa, cublasOperation_t transb, float beta, float *C, int ldc, int tdc, int batchCount, size_t stream) { cublasHandle_t handle = detail::get_cublas_handle(); - CUBLAS_CHECK( - cublasSetStream(handle, *(cudaStream_t*)stream)); + CUBLAS_CHECK(cublasSetStream(handle, *(cudaStream_t *)stream)); CUBLAS_CHECK(cublasSgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, tda, B, ldb, tdb, &beta, C, ldc, tdc, batchCount)); @@ -361,15 +394,13 @@ void blas_sgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, int batchCount, size_t stream) { if (use_cutlass) - cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount, - *(cudaStream_t*)stream); + cutlass_sgemm_wrapper_grouped( + blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, + offsetsB, beta, C, ldc, offsetsC, batchCount, *(cudaStream_t *)stream); else cublas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount, - *(cudaStream_t*)stream); + offsetsC, batchCount, *(cudaStream_t *)stream); } void blas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, cublasOperation_t transb, int m, int *n, int *k, @@ -377,8 +408,8 @@ void blas_dgemm_wrapper_grouped(int blas_id, cublasOperation_t transa, int *offsetsA, const double *B, int ldb, int *offsetsB, double beta, double *C, int ldc, int *offsetsC, int batchCount, size_t stream) { - cublas_dgemm_wrapper_grouped( - blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, - C, ldc, offsetsC, batchCount, *(cudaStream_t*)stream); + cublas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, + B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, *(cudaStream_t *)stream); } } From 4762963dc47eb6d9ae782749582f0b0ec7438704 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:40 -0700 Subject: [PATCH 214/263] Remove unneeded zero init --- src/trans/gpu/internal/ledir_mod.F90 | 6 ------ src/trans/gpu/internal/leinv_mod.F90 | 7 ------- 2 files changed, 13 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 1b5d94c72..d2d3df903 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -159,12 +159,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC& PRESENT(ZAA,ZAS,POA1) & !$ACC& PRESENT(D_NPNTGTB1) -! TODO this doesn't make sense that we need it (???) -!$ACC KERNELS -ZINPS(:) = 0 -ZINPA(:) = 0 -!$ACC END KERNELS - !$ACC DATA PRESENT(FOUBUF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) DO KMLOC=1,D_NUMP diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 4569a0565..4969ffe3e 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -162,13 +162,6 @@ SUBROUTINE LEINV(PIA,FOUBUF_IN) !$ACC& PRESENT(ZAA,ZAS,PIA) & !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) -! TODO this doesn't make sense that we need it (???) -!$ACC KERNELS -ZINP(:) = 0 -ZOUTS(:) = 0 -ZOUTA(:) = 0 -!$ACC END KERNELS - ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) !$ACC ENTER DATA CREATE(FOUBUF_IN) From 0352bf7fb95119eb47079f8abca08363a7428b93 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:33 -0700 Subject: [PATCH 215/263] Allow shortcut if only one process for trltom --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 22 ++- src/trans/gpu/internal/ledir_mod.F90 | 187 ++++++++++++++++++----- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 10 +- 3 files changed, 174 insertions(+), 45 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index cfe456bdf..92014429c 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -69,6 +69,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + +USE LEDIR_MOD, ONLY: LEDIR_ALLOC_SIZE use ieee_arithmetic ! @@ -104,6 +106,8 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,IFIRST INTEGER(KIND=JPIM) :: ISIZE,IFIELDS,ICHUNK,ICHUNKS,JK +INTEGER(KIND=JPIM) :: IALLOC_SIZE + ! ------------------------------------------------------------------ ! Field distribution in Spectral/Fourier space @@ -151,13 +155,21 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & IST = IST+KF_SCALARS_G ENDIF +IALLOC_SIZE=KF_FS*D%NLENGTF + + +IF (NPROC == 1) THEN + ! If we have only one process, we skip the intermediate buffer, so we need more space + IALLOC_SIZE = IALLOC_SIZE + LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1)) +ENDIF + IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + ALLOCATE(REUSE_PTR(IALLOC_SIZE)) !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN +ELSEIF (SIZE(REUSE_PTR) < IALLOC_SIZE) THEN !$ACC EXIT DATA DELETE(REUSE_PTR) DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + ALLOCATE(REUSE_PTR(IALLOC_SIZE)) !$ACC ENTER DATA CREATE(REUSE_PTR) ENDIF PREEL_REAL => REUSE_PTR @@ -182,7 +194,9 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) - CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) + IF (NPROC > 1) THEN + CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) + ENDIF ELSE PREEL_COMPLEX => PREEL_REAL ENDIF diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index d2d3df903..586182ec8 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -10,7 +10,48 @@ ! MODULE LEDIR_MOD + USE PARKIND_ECTRANS ,ONLY : JPIM + + PRIVATE + PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE + + INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS +FUNCTION LEDIR_ALLOC_SIZE(KF_FS) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + USE TPM_DISTR ,ONLY : D + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=8) :: LEDIR_ALLOC_SIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + IOUT_STRIDES0 = ALIGN(2*KF_FS,A) + IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IIN_STRIDES0 = ALIGN(2*KF_FS,A) + IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) + IOUT0_STRIDES0 = ALIGN(KF_FS,A) + IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IIN0_STRIDES0 = ALIGN(KF_FS,A) + IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + + ! Check if the reuse buffer is large enough + LEDIR_ALLOC_SIZE = ALIGN(IIN_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & + +ALIGN(IIN_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & + +ALIGN(IOUT_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) & + +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) & + +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) +END FUNCTION SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !**** *LEDIR* - Direct Legendre transform. @@ -69,6 +110,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE IEEE_ARITHMETIC USE OPENACC +!!!! +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN + IMPLICIT NONE @@ -84,7 +129,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IA, IS, ISL, J INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPRBT) :: PAIA, PAIS +REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 @@ -97,7 +142,9 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS -INTEGER(KIND=JPIM) :: A = 8 !Alignment + INTEGER(KIND=JPIM) :: IGLG, IOFF_LAT, ISTA, OFFSET_VAR, KGL, JM + REAL(KIND=JPRBT):: SCAL + REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) @@ -110,8 +157,18 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IIN0_STRIDES0 = ALIGN(KF_FS,A) IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,A) +IF (NPROC == 1) THEN + ! Short cut - no need to go through tansforms, we will go directly into + ! the legendre space, but for that we need twice the memory, roughly + ! (but we don't need the send/recv buffers) + ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) +ELSE + ALLOC_SZ = 0 +ENDIF + ! Check if the reuse buffer is large enough -ALLOC_SZ = ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPS(1)) & +ALLOC_SZ = ALLOC_SZ & + +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPS(1)) & +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPA(1)) & +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUT(1)) & +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINPS0(1)) & @@ -121,7 +178,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (.NOT. ALLOCATED(REUSE_PTR)) THEN ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZEOF(REUSE_PTR) <= ALLOC_SZ) THEN +ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN ! and reallocate if needed !$ACC EXIT DATA DELETE(REUSE_PTR) DEALLOCATE(REUSE_PTR) @@ -131,9 +188,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ! Figure out which pointers to use ALLOC_POS=1 +IF (NPROC == 1) THEN + ALLOC_POS = ALLOC_POS+KF_FS*D%NLENGTF +ENDIF CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE, & & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) +ALLOC_POS=1 ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) ZINPA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) @@ -142,8 +203,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) ! The BASE0 pointer points to the rest, but likely in a different type! -CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) +CALL C_F_POINTER(C_LOC(ZBASE(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(ZBASE(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) ALLOC_POS=1 ZINPS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) @@ -159,38 +220,88 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC& PRESENT(ZAA,ZAS,POA1) & !$ACC& PRESENT(D_NPNTGTB1) -!$ACC DATA PRESENT(FOUBUF) -!$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS - PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) +IF (NPROC > 1) THEN + + !$ACC DATA PRESENT(FOUBUF) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDDO + ENDDO + END DO + !$ACC END DATA +ELSE + + OFFSET_VAR=D_NPTRLS(MYSETW) + + PREEL_COMPLEX(1:) => REUSE_PTR(1:) + !$ACC DATA PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) + DO JGL=1,R_NDGNH + DO KMLOC=1,D_NUMP + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = JGL + + OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) + + IGLS = R_NDGL+1-JGL + OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) + + PAIA = V1-V2 + PAIS = V1+V2 + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF ENDIF - ENDIF + ENDDO ENDDO - ENDDO -END DO -!$ACC END DATA + END DO + !$ACC END DATA +ENDIF ! anti-symmetric IF(KMLOC0 > 0) THEN @@ -347,8 +458,10 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC END DATA -!$ACC EXIT DATA DELETE(FOUBUF) -DEALLOCATE(FOUBUF) +IF (NPROC > 1) THEN + !$ACC EXIT DATA DELETE(FOUBUF) + DEALLOCATE(FOUBUF) +ENDIF IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index e0667f5de..cc10e4489 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -43,7 +43,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D + USE TPM_DISTR ,ONLY : D, NPROC USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F @@ -72,12 +72,14 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& INTEGER(KIND=JPIM) :: JM,IM CALL GSTATS(153,0) + IF (NPROC > 1) THEN #ifdef USE_CUDA_AWARE_MPI_FT - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' + CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) #else - CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) + CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) #endif + ENDIF CALL GSTATS(153,1) ! Direct Legendre transform From 665c81a640f3eec635f883fce3f3510ac4a0567d Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:33 -0700 Subject: [PATCH 216/263] Move TRLTOM to dir_trans --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 67 ++++++-------------- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 28 ++------ 2 files changed, 24 insertions(+), 71 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 81b2492eb..7a85c0645 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -75,9 +75,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE PARKIND1 ,ONLY : JPIM ,JPRB USE PARKIND_ECTRANS ,ONLY : JPRBT -USE TPM_GEN ,ONLY : NPROMATR +USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B -!USE TPM_DISTR +USE TPM_DISTR, ONLY: NPROC USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT @@ -85,6 +85,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS +USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE ! IMPLICIT NONE @@ -124,6 +125,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) +REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) ! ------------------------------------------------------------------ @@ -132,52 +134,8 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& IF_GPB = 2*KF_UV_G+KF_SCALARS_G IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN - - ! Fields to be split into packets - - CALL SHUFFLE(KF_UV_G,KF_SCALARS_G,ISHFUV_G,IVSETUV,ISHFSC_G,IVSETSC,& - & KVSETUV,KVSETSC) - - IBLKS=(IF_GPB-1)/NPROMATR+1 - - DO JBLK=1,IBLKS - - CALL FIELD_SPLIT(JBLK,KF_GP,KF_UV_G,IVSETUV,IVSETSC,& - & ISTUV_G,IENUV_G,IF_UV_G,ISTSC_G,IENSC_G,IF_SCALARS_G,& - & ISTUV,IENUV,IF_UV,ISTSC,IENSC,IF_SCALARS) - - IF_FS = 2*IF_UV + IF_SCALARS - IF_GP = 2*IF_UV_G+IF_SCALARS_G - DO JFLD=1,IF_UV_G - IPTRGP(JFLD) = ISHFUV_G(ISTUV_G+JFLD-1) - IPTRGP(JFLD+IF_UV_G) = KF_UV_G+ISHFUV_G(ISTUV_G+JFLD-1) - ENDDO - DO JFLD=1,IF_SCALARS_G - IPTRGP(JFLD+2*IF_UV_G) = 2*KF_UV_G+ISHFSC_G(ISTSC_G+JFLD-1) - ENDDO - DO JFLD=1,IF_UV - IPTRSPUV(JFLD) = ISTUV+JFLD-1 - ENDDO - DO JFLD=1,IF_SCALARS - IPTRSPSC(JFLD) = ISTSC+JFLD-1 - ENDDO - - IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) - ELSEIF(IF_UV_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KPTRGP=IPTRGP,PGP=PGP) - ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTDIR_CTL(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,FOUBUF_IN,& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) - ENDIF - CALL LTDIR_CTL(IF_FS,IF_UV,IF_SCALARS,FOUBUF_IN,& - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) - ENDDO + PRINT *, "ERROR, not implemented right now" + stop 4 ELSE ! No splitting of fields, transform done in one go @@ -186,7 +144,18 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN, & + CALL GSTATS(153,0) + IF (NPROC > 1) THEN +#ifdef USE_CUDA_AWARE_MPI_FT + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' + CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) +#else + CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) +#endif + ENDIF + CALL GSTATS(153,1) + + CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF, & & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) ENDIF diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 index cc10e4489..8e07dcc43 100755 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE LTDIR_CTL_MOD CONTAINS - SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& + SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF,& & PSPVOR,PSPDIV,PSPSCALAR, & & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -41,15 +41,13 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& USE PARKIND1 ,ONLY : JPIM ,JPRB USE PARKIND_ECTRANS ,ONLY : JPRBT - USE TPM_GEN, only: nout USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D, NPROC + USE TPM_DISTR ,ONLY : D USE TPM_GEOMETRY ,ONLY : G USE TPM_FIELDS ,ONLY : F USE LTDIR_MOD ,ONLY : LTDIR - USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE USE TPM_FIELDS ,ONLY : ZEPSNM USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -57,7 +55,7 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& IMPLICIT NONE INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) @@ -67,34 +65,20 @@ SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF_IN,& INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) INTEGER(KIND=JPIM) :: JM,IM - CALL GSTATS(153,0) - IF (NPROC > 1) THEN -#ifdef USE_CUDA_AWARE_MPI_FT - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) -#else - CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) -#endif - ENDIF - CALL GSTATS(153,1) - ! Direct Legendre transform - - CALL GSTATS(103,0) - CALL GSTATS(1645,0) + CALL GSTATS(103,0) IF (KF_FS > 0) THEN + CALL GSTATS(1645,0) CALL LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) + CALL GSTATS(1645,1) ENDIF - CALL GSTATS(1645,1) - CALL GSTATS(103,1) ! ----------------------------------------------------------------- From 547315be63ce45e9f25cd43a5a671109ee102368 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:33 -0700 Subject: [PATCH 217/263] Remove empty ltdir_ctl wrapper --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 10 ++- src/trans/gpu/internal/ltdir_ctl_mod.F90 | 87 -------------------- 2 files changed, 6 insertions(+), 91 deletions(-) delete mode 100755 src/trans/gpu/internal/ltdir_ctl_mod.F90 diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 7a85c0645..8ece3f1eb 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -81,7 +81,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTDIR_CTL_MOD ,ONLY : LTDIR_CTL +USE LTDIR_MOD ,ONLY : LTDIR USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS @@ -155,9 +155,11 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ENDIF CALL GSTATS(153,1) - CALL LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF, & - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) + IF (KF_FS > 0) THEN + CALL LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS, & + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2) + ENDIF ENDIF ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltdir_ctl_mod.F90 b/src/trans/gpu/internal/ltdir_ctl_mod.F90 deleted file mode 100755 index 8e07dcc43..000000000 --- a/src/trans/gpu/internal/ltdir_ctl_mod.F90 +++ /dev/null @@ -1,87 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTDIR_CTL_MOD - CONTAINS - SUBROUTINE LTDIR_CTL(KF_FS,KF_UV,KF_SCALARS,FOUBUF,& - & PSPVOR,PSPDIV,PSPSCALAR, & - & PSPSC3A,PSPSC3B,PSPSC2, & - & KFLDPTRUV,KFLDPTRSC) - - !**** *LTDIR_CTL* - Control routine for direct Legendre transform - - ! Purpose. - ! -------- - ! Direct Legendre transform - - !** Interface. - ! ---------- - ! CALL LTDIR_CTL(...) - - ! Explicit arguments : - ! -------------------- - ! KF_FS - number of fields in Fourier space - ! KF_UV - local number of spectral u-v fields - ! KF_SCALARS - local number of scalar spectral fields - ! PSPVOR(:,:) - spectral vorticity (output) - ! PSPDIV(:,:) - spectral divergence (output) - ! PSPSCALAR(:,:) - spectral scalarvalued fields (output) - ! KFLDPTRUV(:) - field pointer for vorticity and divergence (input) - ! KFLDPTRSC(:) - field pointer for scalarvalued fields (input) - - ! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB -USE PARKIND_ECTRANS ,ONLY : JPRBT - - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D - USE TPM_GEOMETRY ,ONLY : G - USE TPM_FIELDS ,ONLY : F - - - USE LTDIR_MOD ,ONLY : LTDIR - - USE TPM_FIELDS ,ONLY : ZEPSNM - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - - - INTEGER(KIND=JPIM) :: JM,IM - - ! Direct Legendre transform - - CALL GSTATS(103,0) - IF (KF_FS > 0) THEN - CALL GSTATS(1645,0) - CALL LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - CALL GSTATS(1645,1) - ENDIF - CALL GSTATS(103,1) - - ! ----------------------------------------------------------------- - - END SUBROUTINE LTDIR_CTL - END MODULE LTDIR_CTL_MOD From 171e43fac4bfa1e2e3703b6015935a6813e94d47 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:33 -0700 Subject: [PATCH 218/263] move fourier_out outwards --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 5 ++++- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 9 ++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 8ece3f1eb..671309bd4 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -86,6 +86,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE +USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT ! IMPLICIT NONE @@ -126,6 +127,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) +REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) ! ------------------------------------------------------------------ @@ -139,13 +141,14 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ELSE ! No splitting of fields, transform done in one go - CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN,& + CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) CALL GSTATS(153,0) IF (NPROC > 1) THEN + CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) #ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 92014429c..43e63441c 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_CTL_MOD CONTAINS -SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & +SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX, & & KVSETUV,KVSETSC,KPTRGP,& & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -61,7 +61,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & !USE TPM_DIM !USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D, MYPROC, NPROC -USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT USE TPM_TRANS, ONLY: REUSE_PTR USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE @@ -92,11 +91,10 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) +REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: PREEL_COMPLEX(:) ! Local variables REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -194,9 +192,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,FOUBUF_IN, & CALL GSTATS(1640,0) IF (KF_FS > 0) THEN CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) - IF (NPROC > 1) THEN - CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) - ENDIF ELSE PREEL_COMPLEX => PREEL_REAL ENDIF From 363e7da0e605cb9821b4d47885cdd2412b5c28bb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:33 -0700 Subject: [PATCH 219/263] Restructure LEDIR --- src/trans/gpu/internal/ledir_mod.F90 | 353 ++++++++++++++++----------- 1 file changed, 216 insertions(+), 137 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 586182ec8..c6372676c 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -13,10 +13,43 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM PRIVATE - PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE + PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_PACK_BUFFER INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS +SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 + + IF (PRESENT(IOUT_STRIDES0)) & + IOUT_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IOUT_STRIDES1)) & + IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN_STRIDES0)) & + IIN_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IIN_STRIDES1)) & + IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IOUT0_STRIDES0)) & + IOUT0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IOUT0_STRIDES1)) & + IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN0_STRIDES0)) & + IIN0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IIN0_STRIDES1)) & + IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) +END SUBROUTINE + + FUNCTION LEDIR_ALLOC_SIZE(KF_FS) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R @@ -35,14 +68,8 @@ FUNCTION LEDIR_ALLOC_SIZE(KF_FS) REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY - IOUT_STRIDES0 = ALIGN(2*KF_FS,A) - IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) - IIN_STRIDES0 = ALIGN(2*KF_FS,A) - IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) - IOUT0_STRIDES0 = ALIGN(KF_FS,A) - IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) - IIN0_STRIDES0 = ALIGN(KF_FS,A) - IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) ! Check if the reuse buffer is large enough LEDIR_ALLOC_SIZE = ALIGN(IIN_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & @@ -52,6 +79,178 @@ FUNCTION LEDIR_ALLOC_SIZE(KF_FS) +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) & +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) END FUNCTION + +SUBROUTINE LEDIR_POINTERS(ZBUF,KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + USE TPM_DISTR ,ONLY : D + USE, INTRINSIC :: ISO_C_BINDING + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN), TARGET :: ZBUF(:) + REAL(KIND=JPRBT), INTENT(OUT), OPTIONAL, POINTER :: ZINPS(:), ZINPA(:), ZOUT(:) + REAL(KIND=JPRD), INTENT(OUT), OPTIONAL, POINTER :: ZINPS0(:), ZINPA0(:), ZOUT0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=8) :: ALLOC_POS + + REAL(KIND=JPRBT), POINTER :: ZBASE(:) + REAL(KIND=JPRD), POINTER :: ZBASE0(:) + + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + CALL C_F_POINTER(C_LOC(ZBUF(:)), ZBASE, & + & [SIZEOF(ZBUF(:))/SIZEOF(ZBASE(0))]) + + ALLOC_POS=1 + IF (PRESENT(ZINPS)) ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D%NUMP-1) + ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D%NUMP,8) + IF (PRESENT(ZINPA)) ZINPA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D%NUMP-1) + ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D%NUMP,8) + IF (PRESENT(ZOUT)) ZOUT(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D%NUMP-1) + ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D%NUMP,8) + + ! The BASE0 pointer points to the rest, but likely in a different type! + CALL C_F_POINTER(C_LOC(ZBASE(ALLOC_POS:)), ZBASE0, & + & [SIZEOF(ZBASE(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) + ALLOC_POS=1 + IF (PRESENT(ZINPS0)) ZINPS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) + ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) + IF (PRESENT(ZINPA0)) ZINPA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) + ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) + IF (PRESENT(ZOUT0)) ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) + ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) + +END SUBROUTINE + +SUBROUTINE LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL + USE TPM_GEOMETRY ,ONLY : G, G_NDGLU + USE TPM_FIELDS ,ONLY : F + USE TPM_TRANS, ONLY: REUSE_PTR + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC,NPROC + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + USE TPM_DISTR ,ONLY : D,MYSETW, NPROC,D_NSTAGTF,D_NPTRLS + USE TPM_GEOMETRY ,ONLY : G,G_NLOEN + USE, INTRINSIC :: ISO_C_BINDING + + IMPLICIT NONE + + REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) + REAL(KIND=JPRBT), INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + + REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0 + INTEGER(KIND=8) :: JF, OFFSET_VAR + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC + + REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0) + + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & + !$ACC& PRESENT(F,F%RW) & + !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & + !$ACC& PRESENT(D_NPNTGTB1) + + IF (NPROC > 1) THEN + + !$ACC DATA PRESENT(FOUBUF) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF + ENDIF + ENDDO + ENDDO + END DO + !$ACC END DATA + ELSE + + OFFSET_VAR=D_NPTRLS(MYSETW) + + PREEL_COMPLEX(1:) => REUSE_PTR(1:) + !$ACC DATA PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) + DO JGL=1,R_NDGNH + DO KMLOC=1,D_NUMP + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = JGL + + OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) + + IGLS = R_NDGL+1-JGL + OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) + + PAIA = V1-V2 + PAIS = V1+V2 + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF + ENDIF + ENDDO + ENDDO + END DO + !$ACC END DATA + ENDIF + + !$ACC END DATA +END SUBROUTINE SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !**** *LEDIR* - Direct Legendre transform. @@ -102,7 +301,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 USE TPM_TRANS, ONLY: REUSE_PTR -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 +USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1, NPROC USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -110,10 +309,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) USE IEEE_ARITHMETIC USE OPENACC -!!!! -USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN - IMPLICIT NONE @@ -142,21 +337,8 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS - INTEGER(KIND=JPIM) :: IGLG, IOFF_LAT, ISTA, OFFSET_VAR, KGL, JM - REAL(KIND=JPRBT):: SCAL - REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -IOUT_STRIDES0 = ALIGN(2*KF_FS,A) -IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) -IIN_STRIDES0 = ALIGN(2*KF_FS,A) -IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R_NDGNH,A) -IOUT0_STRIDES0 = ALIGN(KF_FS,A) -IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) -IIN0_STRIDES0 = ALIGN(KF_FS,A) -IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R_NDGNH,A) - IF (NPROC == 1) THEN ! Short cut - no need to go through tansforms, we will go directly into ! the legendre space, but for that we need twice the memory, roughly @@ -167,13 +349,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) ENDIF ! Check if the reuse buffer is large enough -ALLOC_SZ = ALLOC_SZ & - +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPS(1)) & - +ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINPA(1)) & - +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUT(1)) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINPS0(1)) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINPA0(1)) & - +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUT0(1)) +ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) IF (.NOT. ALLOCATED(REUSE_PTR)) THEN ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) @@ -191,27 +367,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (NPROC == 1) THEN ALLOC_POS = ALLOC_POS+KF_FS*D%NLENGTF ENDIF -CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE, & - & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) -ALLOC_POS=1 -ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) -ZINPA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) -ZOUT(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) - -! The BASE0 pointer points to the rest, but likely in a different type! -CALL C_F_POINTER(C_LOC(ZBASE(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(ZBASE(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) -ALLOC_POS=1 -ZINPS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) -ZINPA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) -ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) +CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + +CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + +CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & @@ -220,89 +382,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC& PRESENT(ZAA,ZAS,POA1) & !$ACC& PRESENT(D_NPNTGTB1) -IF (NPROC > 1) THEN - - !$ACC DATA PRESENT(FOUBUF) - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) - DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS - PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) - ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) - ENDIF - ENDIF - ENDDO - ENDDO - END DO - !$ACC END DATA -ELSE - - OFFSET_VAR=D_NPTRLS(MYSETW) - - PREEL_COMPLEX(1:) => REUSE_PTR(1:) - !$ACC DATA PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) - DO JGL=1,R_NDGNH - DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = JGL - - OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) - - IGLS = R_NDGL+1-JGL - OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) - - PAIA = V1-V2 - PAIS = V1+V2 - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) - ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) - ENDIF - ENDIF - ENDDO - ENDDO - END DO - !$ACC END DATA -ENDIF - ! anti-symmetric IF(KMLOC0 > 0) THEN PRINT*,'computing m=0 in double precision' From bd5e9076d3a2a4d31c4e8636a5814fecaf44c6a5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:34 -0700 Subject: [PATCH 220/263] Move ledir pack to ltdir --- src/trans/gpu/internal/ledir_mod.F90 | 55 ++++------------------------ src/trans/gpu/internal/ltdir_mod.F90 | 53 +++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 50 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c6372676c..20cefcdee 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -13,7 +13,7 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM PRIVATE - PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_PACK_BUFFER + PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_PACK_BUFFER, LEDIR_POINTERS INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS @@ -251,7 +251,7 @@ SUBROUTINE LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) !$ACC END DATA END SUBROUTINE -SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) +SUBROUTINE LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) !**** *LEDIR* - Direct Legendre transform. @@ -312,11 +312,13 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IMPLICIT NONE - ! DUMMY ARGUMENTS -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) +REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) +REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) +REAL(KIND=JPRBT), INTENT(OUT) :: ZOUT(:) +REAL(KIND=JPRD), INTENT(OUT) :: ZOUT0(:) REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV +INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS ! LOCAL VARIABLES INTEGER(KIND=JPIM) :: KM @@ -328,8 +330,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 -REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINPS(:), ZINPA(:), ZOUT(:) -REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINPS0(:), ZINPA0(:), ZOUT0(:) INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 @@ -339,39 +339,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) -IF (NPROC == 1) THEN - ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space, but for that we need twice the memory, roughly - ! (but we don't need the send/recv buffers) - ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) -ELSE - ALLOC_SZ = 0 -ENDIF - -! Check if the reuse buffer is large enough -ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) - -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN - ! and reallocate if needed - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF - -! Figure out which pointers to use -ALLOC_POS=1 -IF (NPROC == 1) THEN - ALLOC_POS = ALLOC_POS+KF_FS*D%NLENGTF -ENDIF - -CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - -CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) @@ -379,8 +346,7 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & -!$ACC& PRESENT(ZAA,ZAS,POA1) & -!$ACC& PRESENT(D_NPNTGTB1) +!$ACC& PRESENT(ZAA,ZAS,POA1) ! anti-symmetric IF(KMLOC0 > 0) THEN @@ -537,11 +503,6 @@ SUBROUTINE LEDIR(FOUBUF,POA1,KF_FS,KF_UV) !$ACC END DATA -IF (NPROC > 1) THEN - !$ACC EXIT DATA DELETE(FOUBUF) - DEALLOCATE(FOUBUF) -ENDIF - IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 57bb0a1ac..aa067eafa 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -16,7 +16,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& & KFLDPTRUV,KFLDPTRSC) - USE PARKIND1 ,ONLY : JPIM ,JPRB + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -25,7 +25,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEOMETRY USE PREPSNM_MOD ,ONLY : PREPSNM - USE LEDIR_MOD ,ONLY : LEDIR + USE LEDIR_MOD ,ONLY : LEDIR, LEDIR_POINTERS, LEDIR_PACK_BUFFER, LEDIR_ALLOC_SIZE USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB @@ -33,6 +33,10 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + +USE TPM_TRANS, ONLY: REUSE_PTR +USE TPM_DISTR, ONLY : NPROC + !**** *LTDIR* - Control of Direct Legendre transform step @@ -112,6 +116,12 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA1(:,:,:) REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA2(:,:,:) REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + +REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) +REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) +REAL(KIND=JPRBT), POINTER :: ZOUT(:) +REAL(KIND=JPRD), POINTER :: ZOUT0(:) +INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) @@ -126,12 +136,49 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& !* 2. PREPARE WORK ARRAYS. ! -------------------- + IF (NPROC == 1) THEN + ! Short cut - no need to go through tansforms, we will go directly into + ! the legendre space, but for that we need twice the memory, roughly + ! (but we don't need the send/recv buffers) + ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) + ELSE + ALLOC_SZ = 0 + ENDIF + + ! Check if the reuse buffer is large enough + ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) + + IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN + ! and reallocate if needed + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ENDIF + + ! Figure out which pointers to use + ALLOC_POS=1 + IF (NPROC == 1) THEN + ALLOC_POS = ALLOC_POS+KF_FS*D%NLENGTF + ENDIF + + CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + + CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + IF (NPROC > 1) THEN + !$ACC EXIT DATA DELETE(FOUBUF) + DEALLOCATE(FOUBUF) + ENDIF + ALLOCATE(POA1(2*KF_FS,R%NTMAX+3,D%NUMP)) !$ACC ENTER DATA CREATE(POA1) ! do the legendre transform - CALL LEDIR(FOUBUF,POA1,KF_FS,KF_UV) +CALL LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) From fec161ff95cf80b51bee0e1c984a0634079b4dde Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:34 -0700 Subject: [PATCH 221/263] Move packing for legendre transform into dir_trans --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 55 ++++++++++++++++++- src/trans/gpu/internal/ltdir_mod.F90 | 58 +++----------------- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 671309bd4..5c776e151 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -73,7 +73,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! ------------------------------------------------------------------ USE PARKIND1 ,ONLY : JPIM ,JPRB -USE PARKIND_ECTRANS ,ONLY : JPRBT +USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B @@ -87,6 +87,10 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEN ,ONLY : LSYNC_TRANS USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT + +USE TPM_TRANS, ONLY: REUSE_PTR +USE TPM_DISTR, ONLY : D, NPROC +USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_PACK_BUFFER, LEDIR_ALLOC_SIZE ! IMPLICIT NONE @@ -125,10 +129,17 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB +INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) +REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) +REAL(KIND=JPRBT), POINTER :: ZOUT(:) +REAL(KIND=JPRD), POINTER :: ZOUT0(:) + ! ------------------------------------------------------------------ @@ -154,12 +165,52 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) #else CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) + #endif + + ALLOC_SZ = LEDIR_ALLOC_SIZE(KF_FS) + ALLOC_POS =1 + IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN + ! and reallocate if needed + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ENDIF + CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + IF (KF_FS > 0) CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + + !$ACC EXIT DATA DELETE(FOUBUF) + DEALLOCATE(FOUBUF) + + ELSE + ! Short cut - no need to go through tansforms, we will go directly into + ! the legendre space, but for that we need twice the memory, roughly + ! (but we don't need the send/recv buffers) + ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) + ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) + ALLOC_POS = 1+KF_FS*D%NLENGTF + + IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN + ! and reallocate if needed + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ENDIF + CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + IF (KF_FS > 0) CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF CALL GSTATS(153,1) IF (KF_FS > 0) THEN - CALL LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS, & + CALL LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2) ENDIF diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index aa067eafa..bebb55dfe 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -10,7 +10,7 @@ MODULE LTDIR_MOD CONTAINS - SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& + SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -25,7 +25,7 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEOMETRY USE PREPSNM_MOD ,ONLY : PREPSNM - USE LEDIR_MOD ,ONLY : LEDIR, LEDIR_POINTERS, LEDIR_PACK_BUFFER, LEDIR_ALLOC_SIZE + USE LEDIR_MOD ,ONLY : LEDIR USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB @@ -34,9 +34,6 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -USE TPM_TRANS, ONLY: REUSE_PTR -USE TPM_DISTR, ONLY : NPROC - !**** *LTDIR* - Control of Direct Legendre transform step @@ -107,7 +104,10 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) + REAL(KIND=JPRBT), INTENT(OUT) :: ZOUT(:) + REAL(KIND=JPRD), INTENT(OUT) :: ZOUT0(:) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST @@ -116,12 +116,6 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA1(:,:,:) REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA2(:,:,:) REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) - -REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) -REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) -REAL(KIND=JPRBT), POINTER :: ZOUT(:) -REAL(KIND=JPRD), POINTER :: ZOUT0(:) -INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) @@ -136,49 +130,11 @@ SUBROUTINE LTDIR(FOUBUF,KF_FS,KF_UV,KF_SCALARS,& !* 2. PREPARE WORK ARRAYS. ! -------------------- - IF (NPROC == 1) THEN - ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space, but for that we need twice the memory, roughly - ! (but we don't need the send/recv buffers) - ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) - ELSE - ALLOC_SZ = 0 - ENDIF - - ! Check if the reuse buffer is large enough - ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) - - IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN - ! and reallocate if needed - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ENDIF - - ! Figure out which pointers to use - ALLOC_POS=1 - IF (NPROC == 1) THEN - ALLOC_POS = ALLOC_POS+KF_FS*D%NLENGTF - ENDIF - - CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - - CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - IF (NPROC > 1) THEN - !$ACC EXIT DATA DELETE(FOUBUF) - DEALLOCATE(FOUBUF) - ENDIF - - ALLOCATE(POA1(2*KF_FS,R%NTMAX+3,D%NUMP)) !$ACC ENTER DATA CREATE(POA1) ! do the legendre transform -CALL LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) + CALL LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) From d2607da0ac523864d2a85e1efdb846648b11b170 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 2 Jun 2022 05:28:47 -0700 Subject: [PATCH 222/263] move self copy in trgtol, remove trgtol (not cudaaware) --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 77 +- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 49 +- src/trans/gpu/internal/ftinv_ctlad_mod.F90 | 1 - src/trans/gpu/internal/ledir_mod.F90 | 4 +- src/trans/gpu/internal/trgtol_mod.F90 | 824 ++----------------- 5 files changed, 124 insertions(+), 831 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 5c776e151..0f08ec804 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -78,11 +78,12 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_DISTR, ONLY: NPROC +USE FTDIR_MOD ,ONLY : FTDIR USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT USE LTDIR_MOD ,ONLY : LTDIR -USE FTDIR_CTL_MOD ,ONLY : FTDIR_CTL +USE FTDIR_CTL_MOD ,ONLY : GRID_TO_FOURIER USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE @@ -90,7 +91,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE TPM_TRANS, ONLY: REUSE_PTR USE TPM_DISTR, ONLY : D, NPROC -USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_PACK_BUFFER, LEDIR_ALLOC_SIZE +USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_UNPACK_BUFFER, LEDIR_ALLOC_SIZE ! IMPLICIT NONE @@ -129,10 +130,11 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB -INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS +INTEGER(KIND=8) :: IALLOC_SZ, IALLOC_POS REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) @@ -141,6 +143,27 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRD), POINTER :: ZOUT0(:) +IALLOC_SZ=KF_FS*D%NLENGTF + + +IF (NPROC == 1) THEN + ! If we have only one process, we skip the intermediate buffer, so we need more space + IALLOC_SZ = IALLOC_SZ + LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1)) +ELSE + IALLOC_SZ = MAX(IALLOC_SZ, LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1))) +ENDIF + +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(IALLOC_SZ)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZE(REUSE_PTR) < IALLOC_SZ) THEN + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(IALLOC_SZ)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ENDIF +PREEL_REAL => REUSE_PTR + ! ------------------------------------------------------------------ ! Perform transform @@ -151,12 +174,21 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& stop 4 ELSE - ! No splitting of fields, transform done in one go - CALL FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX,& + ! from the PGP arrays to PREEL_REAL + CALL GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) + CALL GSTATS(1640,0) + IF (KF_FS > 0) THEN + CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) + ELSE + PREEL_COMPLEX => PREEL_REAL + ENDIF + CALL GSTATS(1640,1) + CALL GSTATS(153,0) IF (NPROC > 1) THEN CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -168,20 +200,8 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& #endif - ALLOC_SZ = LEDIR_ALLOC_SIZE(KF_FS) - ALLOC_POS =1 - IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN - ! and reallocate if needed - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ENDIF - CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - IF (KF_FS > 0) CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + CALL LEDIR_POINTERS(REUSE_PTR(1:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + IF (KF_FS > 0) CALL LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) !$ACC EXIT DATA DELETE(FOUBUF) DEALLOCATE(FOUBUF) @@ -190,22 +210,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! Short cut - no need to go through tansforms, we will go directly into ! the legendre space, but for that we need twice the memory, roughly ! (but we don't need the send/recv buffers) - ALLOC_SZ = KF_FS*D%NLENGTF*SIZEOF(REUSE_PTR(1)) - ALLOC_SZ = ALLOC_SZ+LEDIR_ALLOC_SIZE(KF_FS) - ALLOC_POS = 1+KF_FS*D%NLENGTF - - IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ELSEIF (SIZEOF(REUSE_PTR) < ALLOC_SZ) THEN - ! and reallocate if needed - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ENDIF - CALL LEDIR_POINTERS(REUSE_PTR(ALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - IF (KF_FS > 0) CALL LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + IALLOC_POS = 1+KF_FS*D%NLENGTF + CALL LEDIR_POINTERS(REUSE_PTR(IALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) + IF (KF_FS > 0) CALL LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF CALL GSTATS(153,1) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 43e63441c..6d3d33b32 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -10,7 +10,7 @@ MODULE FTDIR_CTL_MOD CONTAINS -SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX, & +SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & & KVSETUV,KVSETSC,KPTRGP,& & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -63,13 +63,10 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX, & USE TPM_DISTR ,ONLY : D, MYPROC, NPROC USE TPM_TRANS, ONLY: REUSE_PTR -USE TRGTOL_MOD ,ONLY : TRGTOL, TRGTOL_CUDAAWARE -USE FTDIR_MOD ,ONLY : FTDIR +USE TRGTOL_MOD ,ONLY : TRGTOL_CUDAAWARE USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - -USE LEDIR_MOD, ONLY: LEDIR_ALLOC_SIZE use ieee_arithmetic ! @@ -91,10 +88,9 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_REAL(:) ! Local variables -REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -153,52 +149,15 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_COMPLEX, & IST = IST+KF_SCALARS_G ENDIF -IALLOC_SIZE=KF_FS*D%NLENGTF - - -IF (NPROC == 1) THEN - ! If we have only one process, we skip the intermediate buffer, so we need more space - IALLOC_SIZE = IALLOC_SIZE + LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1)) -ENDIF - -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(IALLOC_SIZE)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZE(REUSE_PTR) < IALLOC_SIZE) THEN - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(IALLOC_SIZE)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF -PREEL_REAL => REUSE_PTR - ! Transposition CALL GSTATS(158,0) -#ifdef USE_CUDA_AWARE_MPI_FT CALL TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) -#else -CALL TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) -#endif CALL GSTATS(158,1) -CALL GSTATS(106,0) - -! Fourier transform - -CALL GSTATS(1640,0) -IF (KF_FS > 0) THEN - CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) -ELSE - PREEL_COMPLEX => PREEL_REAL -ENDIF -CALL GSTATS(1640,1) - -CALL GSTATS(106,1) ! ------------------------------------------------------------------ -END SUBROUTINE FTDIR_CTL +END SUBROUTINE GRID_TO_FOURIER END MODULE FTDIR_CTL_MOD diff --git a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 b/src/trans/gpu/internal/ftinv_ctlad_mod.F90 index dfc1094f0..098f250cf 100755 --- a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ftinv_ctlad_mod.F90 @@ -72,7 +72,6 @@ SUBROUTINE FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& USE FOURIER_INAD_MOD ,ONLY : FOURIER_INAD USE FSCAD_MOD ,ONLY : FSCAD USE FTINVAD_MOD ,ONLY : FTINVAD -USE TRGTOL_MOD ,ONLY : TRGTOL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS ! diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 20cefcdee..d024de3d4 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -13,7 +13,7 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM PRIVATE - PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_PACK_BUFFER, LEDIR_POINTERS + PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_UNPACK_BUFFER, LEDIR_POINTERS INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS @@ -128,7 +128,7 @@ SUBROUTINE LEDIR_POINTERS(ZBUF,KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) END SUBROUTINE -SUBROUTINE LEDIR_PACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) +SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 44b9b4077..59a4e3594 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -254,76 +254,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF CALL GSTATS(412,1) - ! TODO We should do the local contribution *WHILE* sending the data... - ! Copy local contribution - IF(ISENDTOT(MYPROC) > 0 )THEN - ! I have to send something to myself... - - ! Input is KF_GP fields. We find the resulting KF_FS fields. - IFLDS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS) = KPTRGP(JFLD) - ELSE - IFLDA(IFLDS) = JFLD - ENDIF - ENDIF - ENDDO - - !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) - - ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) - ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) - IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) - CALL GSTATS(1601,0) - IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & - & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) - ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & - & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ENDIF - ENDDO - ENDDO - ENDIF - CALL GSTATS(1601,1) - - !$ACC END DATA - - ENDIF - ! Figure out processes that send or recv something ISEND_COUNTS = 0 IRECV_COUNTS = 0 @@ -425,12 +355,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF !$ACC END DATA ENDDO - !$ACC END DATA !ZCOMBUFS (present) - !$ACC END DATA !PGP3B - !$ACC END DATA !PGP3A - !$ACC END DATA !PGP2 - !$ACC END DATA !PGPUV - !$ACC END DATA !PGP IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) @@ -466,6 +390,76 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDDO !$ACC END HOST_DATA + ! Copy local contribution + IF(ISENDTOT(MYPROC) > 0 )THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. + IFLDS = 0 + DO JFLD=1,KF_GP + IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) + ELSE + IFLDA(IFLDS) = JFLD + ENDIF + ENDIF + ENDDO + + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) + CALL GSTATS(1601,0) + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) + ENDDO + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO + ENDDO + ENDIF + CALL GSTATS(1601,1) + + !$ACC END DATA + + ENDIF + + IF(IR > 0) THEN CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & & CDSTRING='TRGTOL_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') @@ -505,6 +499,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& !$ACC END DATA ! ZCOMBUFR !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES + !$ACC END DATA !ZCOMBUFS (present) + !$ACC END DATA !PGP3B + !$ACC END DATA !PGP3A + !$ACC END DATA !PGP2 + !$ACC END DATA !PGPUV + !$ACC END DATA !PGP IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) @@ -512,676 +512,4 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& END SUBROUTINE TRGTOL_CUDAAWARE - SUBROUTINE TRGTOL(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) - - !**** *TRGTOL * - transposition of grid point data from column - ! structure to latitudinal. Reorganize data between - ! grid point calculations and direct Fourier Transform - - ! Purpose. - ! -------- - - - !** Interface. - ! ---------- - ! *call* *trgtol(...) - - ! Explicit arguments : - ! -------------------- - ! PREEL_REAL - Latitudinal data ready for direct FFT (output) - ! PGP - Blocked grid point data (input) - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original: 95-10-01 - ! D.Dent : 97-08-04 Reorganisation to allow - ! NPRTRV to differ from NPRGPEW - ! : 98-06-17 add mailbox control logic (from TRLTOM) - ! =99-03-29= Mats Hamrud and Deborah Salmond - ! JUMP in FFT's changed to 1 - ! KINDEX introduced and ZCOMBUF not used for same PE - ! 01-11-23 Deborah Salmond and John Hague - ! LIMP_NOOLAP Option for non-overlapping message passing - ! and buffer packing - ! 01-12-18 Peter Towers - ! Improved vector performance of GTOL_PACK,GTOL_UNPACK - ! 03-04-02 G. Radnoti: call barrier always when nproc>1 - ! 08-01-01 G.Mozdzynski: cleanup - ! 09-01-02 G.Mozdzynski: use non-blocking recv and send - ! ------------------------------------------------------------------ - - - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD - - USE TPM_GEN ,ONLY : NOUT - USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRNS, MTAGGL, & - & MYSETV, MYSETW, MYPROC, NPROC - USE TPM_TRANS ,ONLY : LDIVGP, LGPNORM, LSCDERS, LUVDER, LVORGP, NGPBLKS - - USE INIGPTR_MOD ,ONLY : INIGPTR - USE PE2SET_MOD ,ONLY : PE2SET - !USE MYSENDSET_MOD - !USE MYRECVSET_MOD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - ! - USE MPI - - IMPLICIT NONE - - REAL(KIND=JPRBT),INTENT(OUT) :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP - INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGPUV(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) - - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) - REAL(KIND=JPRBT),ALLOCATABLE :: PREEL_REAL_TMP(:) - REAL(KIND=JPRBT) :: ZDUM(2) - - INTEGER(KIND=JPIM) :: ISENT (NPROC) - INTEGER(KIND=JPIM) :: IRCVD (NPROC) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*2) - INTEGER(KIND=JPIM) :: JRECV (NPROC) - INTEGER(KIND=JPIM) :: JSEND (NPROC) - - ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IGL, IGLL, ILAST,& - &ILASTLAT, ILEN, JROC, IPOS, ISETA, & - &ISETB, IRECV, IRECVSET, & - &ISETV, ISEND, ISENDSET, ITAG, J, JBLK, JFLD, & - &JGL, JK, JL, JLOOP, ISETW, IFLD, & - &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, iunit - - ! LOCAL LOGICAL SCALARS - LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY - LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) - LOGICAL :: LLDONE, LLINDER - INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) - INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) - INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF - INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 - INTEGER(KIND=JPIM) :: KINDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(MAX(KF_GP,KF_FS)) - INTEGER(KIND=JPIM) :: ISEND_FLD_TOTAL(NPROC),ISEND_FLD_START(NPROC),ISEND_FLD_END - INTEGER(KIND=JPIM) :: IRECV_FLD_START,IRECV_FLD_END - INTEGER(KIND=JPIM) :: INUMFLDS - INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) - ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: IRECVCOUNT,ISENDCOUNT - INTEGER(KIND=JPIM) :: IJPOS(NGPBLKS),IFLDA(KF_GP),JJ,JI,IFLDT - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - - INTEGER(KIND=JPIM) :: IERROR, irank - INTEGER(KIND=JPIM) :: KF, KGL, KI - - REAL(KIND=JPRBT) :: TIMEF, tc - - ! ------------------------------------------------------------------ - - !* 0. Some initializations - ! -------------------- - - IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE) - - iunit=300+myproc - - CALL GSTATS(1805,0) - - LLINDER = .FALSE. - LLPGPUV = .FALSE. - LLPGP3A = .FALSE. - LLPGP3B = .FALSE. - LLPGP2 = .FALSE. - LLPGPONLY = .FALSE. - IF(PRESENT(KPTRGP)) LLINDER = .TRUE. - IF(PRESENT(PGP)) LLPGPONLY = .TRUE. - IF(PRESENT(PGPUV)) LLPGPUV = .TRUE. - IF(PRESENT(PGP3A)) LLPGP3A = .TRUE. - IF(PRESENT(PGP3B)) LLPGP3B = .TRUE. - IF(PRESENT(PGP2)) LLPGP2 = .TRUE. - IUVPAR=0 - IUVLEV=0 - IOFF1=0 - IOFFNS=KF_SCALARS_G - IOFFEW=2*KF_SCALARS_G - LLUV(:) = .FALSE. - IUVPARS(:) = -99 - IUVLEVS(:) = -99 - IF (LLPGPUV) THEN - IOFF=0 - IUVLEV=UBOUND(PGPUV,2) - IF(LVORGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF - IF(LDIVGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - ENDDO - IUVPAR=IUVPAR+2 - LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE. - IOFF=IOFF+2*IUVLEV - IOFF1=IOFF - IOFFNS=IOFFNS+IOFF - IOFFEW=IOFFEW+IOFF - IOFF=IUVPAR*IUVLEV+KF_SCALARS_G - IF(LUVDER) THEN - IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - LLUV(IOFF+J)=.TRUE. - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - LLUV(IOFF+J+IUVLEV)=.TRUE. - ENDDO - IUVPAR=IUVPAR+2 - IOFF=IOFF+2*IUVLEV - IOFFEW=IOFFEW+2*IUVLEV - ENDIF - ENDIF - LLGP2(:)=.FALSE. - IF(LLPGP2) THEN - IOFF=IOFF1 - IGP2PAR=UBOUND(PGP2,2) - IF(LSCDERS) IGP2PAR=IGP2PAR/3 - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J - ENDDO - IOFF1=IOFF1+IGP2PAR - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+IGP2PAR - ENDDO - IOFFNS=IOFF+IGP2PAR - IOFF=IOFFEW - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+2*IGP2PAR - ENDDO - IOFFEW=IOFF+IGP2PAR - ENDIF - ENDIF - LLGP3A(:) = .FALSE. - IF(LLPGP3A) THEN - IGP3ALEV=UBOUND(PGP3A,2) - IGP3APAR=UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR=IGP3APAR/3 - IOFF=IOFF1 - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1 - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3APAR - IOFF1=IOFF1+IGP3APAR*IGP3ALEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3APAR - IOFFNS=IOFFNS+IGP3APAR*IGP3ALEV - IOFF=IOFFEW - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV - ENDIF - ENDIF - LLGP3B(:) = .FALSE. - IF(LLPGP3B) THEN - IGP3BLEV=UBOUND(PGP3B,2) - IGP3BPAR=UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR=IGP3BPAR/3 - IOFF=IOFF1 - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1 - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3BPAR - IOFF1=IOFF1+IGP3BPAR*IGP3BLEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3BPAR - IOFFNS=IOFFNS+IGP3BPAR*IGP3BLEV - IOFF=IOFFEW - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV - ENDIF - ENDIF - - - CALL INIGPTR(IGPTRSEND,IGPTRRECV) - LLDONE = .FALSE. - - ITAG = MTAGGL - - INDOFFX = 0 - IBUFLENS = 0 - IBUFLENR = 0 - INRECV = 0 - INSEND = 0 - - DO JROC=1,NPROC - - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - IRECVSET = ISETA - ISEND = JROC - ISENDSET = ISETV - ISENT(JROC) = 0 - IRCVD(JROC) = 0 - - ! count up expected number of fields - IPOS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISENDSET .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 - ENDDO - ISEND_FLD_TOTAL(JROC) = IPOS - ISENDTOT(JROC) = IGPTRRECV(ISETW)*IPOS - - IF( JROC /= MYPROC) THEN - IBUFLENS = MAX(IBUFLENS,ISENDTOT(JROC)) - IF(ISENDTOT(JROC) > 0) THEN - INSEND = INSEND+1 - JSEND(INSEND)=JROC - ENDIF - ENDIF - - IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(IRECVSET)) - ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(IRECVSET)) - - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(IRECVSET)+JGL-D%NFRSTLAT(IRECVSET) - IPOS = IPOS+D%NONL(IGL,ISETB) - ENDDO - - IRECVTOT(JROC) = IPOS*KF_FS - - IF(IRECVTOT(JROC) > 0 .AND. MYPROC /= JROC) THEN - INRECV = INRECV + 1 - JRECV(INRECV)=JROC - ENDIF - - IF( JROC /= MYPROC) IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) - - IF(IPOS > 0) THEN - INDOFF(JROC) = INDOFFX - INDOFFX = INDOFFX+IPOS - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(IRECVSET)+JGL-D%NFRSTLAT(IRECVSET) - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),& - &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1 - IPOS = IPOS+1 - KINDEX(IPOS+INDOFF(JROC)) = JL - ENDDO - ENDDO - ENDIF - - ENDDO - - ISENDCOUNT=0 - IRECVCOUNT=0 - DO J=1,NPROC - ISENDCOUNT=MAX(ISENDCOUNT,ISENDTOT(J)) - IRECVCOUNT=MAX(IRECVCOUNT,IRECVTOT(J)) - ENDDO - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(-1:ISENDCOUNT,INSEND)) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(-1:IRECVCOUNT,INRECV)) - - CALL GSTATS(1805,1) - - ! Send loop............................................................. - - ! Copy local contribution - - IF(ISENDTOT(MYPROC) > 0 )THEN - IFLDS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN - IFLDS = IFLDS+1 - IF(LLINDER) THEN - IFLDOFF(IFLDS) = KPTRGP(JFLD) - ELSE - IFLDOFF(IFLDS) = JFLD - ENDIF - ENDIF - ENDDO - - IPOS=0 - DO JBLK=1,NGPBLKS - IGPTROFF(JBLK)=IPOS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IPOS=IPOS+ILAST-IFIRST+1 - ENDIF - ENDDO - - CALL GSTATS(1601,0) - #ifdef NECSX - !$OMP PARALLEL DO SCHEDULE(DYNAMIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) - #else - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) - #endif - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IF(LLPGPONLY) THEN - DO JK=IFIRST,ILAST - IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - DO JFLD=1,IFLDS - IFLD = IFLDOFF(JFLD) - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP(JK,IFLD,JBLK) - ENDDO - ENDDO - ELSE - DO JFLD=1,IFLDS - IFLD = IFLDOFF(JFLD) - IF(LLUV(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PREEL_REAL(JFLD+KF_FS*IPOS) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP2(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP2(JK,IGP2PARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP3A(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP3B(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = KINDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PREEL_REAL(JFLD+KF_FS*IPOS) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) - ENDDO - ELSE - CALL ABORT_TRANS('TRLTOG_MOD: ERROR') - ENDIF - ENDDO - ENDIF - ENDIF - ENDDO - !$OMP END PARALLEL DO - CALL GSTATS(1601,1) - - ENDIF - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - !....Pack loop......................................................... - - ISEND_FLD_START=1 - CALL GSTATS(1602,0) - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(IFLDT,JBLK,IFIRST,ILAST,JK,JJ,JI,& - !$OMP& INS,ISEND,ISETA,ISETB,ISETW,ISETV,ISENDSET,ISEND_FLD_END,IFLD,IPOS,& - !$OMP& IFLDA,JFLD,IJPOS) - DO INS=1,INSEND - ISEND=JSEND(INS) - CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - ISENDSET = ISETV - ISEND_FLD_END = ISEND_FLD_TOTAL(ISEND) - IFLD = 0 - IPOS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISENDSET .OR. KVSET(JFLD) == -1 ) THEN - IFLD = IFLD+1 - IFLDA(IFLD)=JFLD - ENDIF - ENDDO - - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - IJPOS(JBLK)=IPOS - IPOS=IPOS+(ILAST-IFIRST+1) - ENDIF - ENDDO - - - DO JJ=ISEND_FLD_START(ISEND),ISEND_FLD_END - IFLDT=IFLDA(JJ) - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - IF(LLINDER) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGP(JK,KPTRGP(IFLDT),JBLK) - ENDDO - ELSE - IF(LLPGPONLY) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGP(JK,IFLDT,JBLK) - ENDDO - ELSEIF(LLUV(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) - ENDDO - ELSEIF(LLGP2(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGP2(JK,IGP2PARS(IFLDT),JBLK) - ENDDO - ELSEIF(LLGP3A(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) - ENDDO - ELSEIF(LLGP3B(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-ISEND_FLD_START(ISEND))*IPOS+IJPOS(JBLK)+JK-IFIRST+1 - ZCOMBUFS(JI,INS) = PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) - ENDDO - ENDIF - ENDIF - ENDIF - ENDDO - ENDDO - - IPOS=(ISEND_FLD_END-ISEND_FLD_START(ISEND)+1)*IPOS - ZCOMBUFS(-1,INS) = 1 - ZCOMBUFS(0,INS) = IFLD - ENDDO - !$OMP END PARALLEL DO - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - CALL MPI_COMM_RANK(MPI_COMM_WORLD, IRANK, IERROR) - !IF(irank==0) WRITE(*,*) "packing (trgtol) in sec: ", Tc - #endif - - CALL GSTATS(1602,1) - - IF(.NOT.LGPNORM)THEN - CALL GSTATS(803,0) - ELSE - CALL GSTATS(804,0) - ENDIF - IR=0 - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - ! Receive loop......................................................... - DO INR=1,INRECV - IR=IR+1 - IRECV=JRECV(INR) - CALL MPL_RECV(ZCOMBUFR(-1:IRECVTOT(IRECV),INR), & - & KSOURCE=NPRCIDS(IRECV), & - & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR), & - & KTAG=ITAG,CDSTRING='TRLTOG:' ) - !print*,irank,size(ZCOMBUFR(-1:IRECVTOT(IRECV),INR)) - ENDDO - - !....Send loop......................................................... - DO INS=1,INSEND - IR=IR+1 - ISEND=JSEND(INS) - CALL MPL_SEND(ZCOMBUFS(-1:ISENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND), & - & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR), & - & KTAG=ITAG,CDSTRING='TRGTOL:' ) - ENDDO - - IF(IR > 0) THEN - CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES') - ENDIF - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=(TIMEF()-Tc)/1000.0_JPRBT - ! !IF(irank==0) WRITE(*,*) "non-CUDA-aware isend/irecv (trgtol) in sec: ", Tc - !#endif - - IF(.NOT.LGPNORM)THEN - CALL GSTATS(803,1) - ELSE - CALL GSTATS(804,1) - ENDIF - - !#ifdef COMVERBOSE - ! call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - ! Tc=TIMEF() - !#endif - ! Unpack loop......................................................... - - CALL GSTATS(1603,0) - - - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JL,II,JFLD,INR,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END,IPOS) - DO INR=1,INRECV - IRECV=JRECV(INR) - ILEN = IRECVTOT(IRECV)/KF_FS - IRECV_FLD_START = ZCOMBUFR(-1,INR) - IRECV_FLD_END = ZCOMBUFR(0,INR) - DO JFLD=IRECV_FLD_START,IRECV_FLD_END - DO JL=1,ILEN - II = KINDEX(INDOFF(IRECV)+JL)-1 - PREEL_REAL(JFLD+KF_FS*II) = ZCOMBUFR(JL+(JFLD-IRECV_FLD_START)*ILEN,INR) - ENDDO - ENDDO - ENDDO - !$OMP END PARALLEL DO - - CALL GSTATS(1603,1) - - IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) - IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - - ! this is not efficient but I don't care about this part of code - !$ACC UPDATE DEVICE(PREEL_REAL) - - !$ACC DATA PRESENT(PREEL_REAL) CREATE(PREEL_REAL_TMP) - !$ACC KERNELS - PREEL_REAL_TMP(:) = PREEL_REAL(:) - !$ACC END KERNELS - - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) - DO KGL=1,D%NDGL_FS - DO KF=1,KF_FS - !$ACC LOOP SEQ - DO KI=1,D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL) - PREEL_REAL(KF_FS*D%NSTAGTF(KGL)+(KF-1)*(D%NSTAGTF(KGL+1)-D%NSTAGTF(KGL))+KI) = & - & PREEL_REAL_TMP(KF_FS*D%NSTAGTF(KGL)+(KI-1)*KF_FS+KF) - ENDDO - ENDDO - ENDDO - !$ACC END DATA - - IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) - - END SUBROUTINE TRGTOL END MODULE TRGTOL_MOD From 2576d413919585799338e0db0021085d2ddc9c12 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:34 -0700 Subject: [PATCH 223/263] Add allocator / incomplete cleanup but working --- src/trans/gpu/internal/allocator_mod.F90 | 140 +++++++++++++++++ src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 112 ++++++-------- src/trans/gpu/internal/fourier_out_mod.F90 | 53 ++++--- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 54 ++++--- src/trans/gpu/internal/ftdir_mod.F90 | 13 +- src/trans/gpu/internal/ledir_mod.F90 | 122 ++++++++------- src/trans/gpu/internal/ltdir_mod.F90 | 102 +++++++++--- src/trans/gpu/internal/ltinv_ctlad_mod.F90 | 5 +- src/trans/gpu/internal/trgtol_mod.F90 | 41 ++--- src/trans/gpu/internal/trltom_mod.F90 | 155 ++++--------------- 10 files changed, 479 insertions(+), 318 deletions(-) create mode 100644 src/trans/gpu/internal/allocator_mod.F90 diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 new file mode 100644 index 000000000..7f351e369 --- /dev/null +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -0,0 +1,140 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +MODULE ALLOCATOR_MOD + + USE PARKIND_ECTRANS ,ONLY : JPIM + USE ISO_C_BINDING, ONLY: C_INT8_T, C_SIZE_T + + IMPLICIT NONE + + ! The buffered allocator uses double buffering. The idea is that the allocator + ! iterates through its two buffers, and each allocate returns one or the other + ! buffer. It is a two-step allocator - it expects you to create reservation + ! handles first for all allocations. Then the allocator is instantiated (i.e. + ! the buffers are actually allocated). Instantiation will do an allocation + ! that is large enough two hold all consecutive allocations. Other allocations + ! might be overwritten (like you can't access the allocation done two steps + ! before). + ! After instantiation, you can retrieve your buffers by passing the allocator + ! and the handles to GET_ALLOCATION. Also, we provide helper function + ! ASSIGN_PTR, because an allocation is often split among several "sub-buffers", + ! so you can for example assign the first half of an allocation to one + ! buffer, while the second half to another buffer. + ! If you see "Logical errors" that usually means you try to retrieve a buffer + ! that is not within the reserved allocation size. This might be a valid + ! region in the sense that it is physically allocated, but it might be part of + ! the double buffer. + + + TYPE BUFFERED_ALLOCATOR + INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:1) + INTEGER(KIND=JPIM) :: NEXT_BUF + INTEGER(C_INT8_T), POINTER :: PTR(:) + END TYPE + TYPE ALLOCATION_RESERVATION_HANDLE + INTEGER(KIND=C_SIZE_T) :: SZ + INTEGER(KIND=JPIM) :: BUF + END TYPE + + INTERFACE ASSIGN_PTR + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + USE ISO_C_BINDING + INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + END SUBROUTINE + SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + USE ISO_C_BINDING + INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + END SUBROUTINE + END INTERFACE + +CONTAINS + + FUNCTION MAKE_BUFFERED_ALLOCATOR() + TYPE(BUFFERED_ALLOCATOR) :: MAKE_BUFFERED_ALLOCATOR + + MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0 + MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 + END FUNCTION + + FUNCTION RESERVE(ALLOCATOR, SZ) + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + + TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE + + ALLOCATOR%BUFR_SZ(ALLOCATOR%NEXT_BUF) = MAX(ALLOCATOR%BUFR_SZ(ALLOCATOR%NEXT_BUF),SZ) + RESERVE%BUF = ALLOCATOR%NEXT_BUF + RESERVE%SZ = SZ + + ALLOCATOR%NEXT_BUF = 1-ALLOCATOR%NEXT_BUF + END FUNCTION + + SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(C_INT8_T), OPTIONAL, INTENT(INOUT), POINTER :: OLD_PTR(:) + + ALLOCATOR%BUFR_SZ(1) = ALIGN(ALLOCATOR%BUFR_SZ(1),128) + ALLOCATOR%BUFR_SZ(2) = ALIGN(ALLOCATOR%BUFR_SZ(2),128) + + IF (PRESENT(OLD_PTR)) THEN + IF (SIZEOF(OLD_PTR) < SUM(ALLOCATOR%BUFR_SZ) ) THEN + !$ACC EXIT DATA DELETE(OLD_PTR) IF(PRESENT(OLD_PTR)) + DEALLOCATE(OLD_PTR) + NULLIFY(OLD_PTR) + ALLOCATE(ALLOCATOR%PTR(1:SUM(ALLOCATOR%BUFR_SZ))) + !$ACC ENTER DATA CREATE(ALLOCATOR%PTR) + ELSE + ALLOCATOR%PTR(1:) => OLD_PTR(1:) + ENDIF + ELSE + ALLOCATE(ALLOCATOR%PTR(1:SUM(ALLOCATOR%BUFR_SZ))) + !$ACC ENTER DATA CREATE(ALLOCATOR%PTR) + ENDIF + END SUBROUTINE + + FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION + + INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION(:) + + IF (RESERVATION%SZ > ALLOCATOR%BUFR_SZ(RESERVATION%BUF)) THEN + PRINT *, "Logical Error in GET_ALLOCATOIN" + STOP 4 + ENDIF + IF (RESERVATION%BUF == 0) THEN + GET_ALLOCATION(1:) => ALLOCATOR%PTR(1:RESERVATION%SZ) + ELSE + GET_ALLOCATION(1:) => ALLOCATOR%PTR(ALLOCATOR%BUFR_SZ(0)+1: & + ALLOCATOR%BUFR_SZ(0)+RESERVATION%SZ) + ENDIF + END FUNCTION + + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + USE ISO_C_BINDING + INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN + PRINT *, "Logical Error in ASSIGN_PTR - OOB assignment" + STOP 4 + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES))/SIZEOF(DST(0))]) + END SUBROUTINE + SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + USE ISO_C_BINDING + INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN + PRINT *, "Logical Error in ASSIGN_PTR - OOB assignment" + STOP 4 + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES))/SIZEOF(DST(0))]) + END SUBROUTINE +END MODULE diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 0f08ec804..db18fbfae 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -78,20 +78,23 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_DISTR, ONLY: NPROC -USE FTDIR_MOD ,ONLY : FTDIR +USE FTDIR_MOD ,ONLY : FTDIR, FTDIR_HANDLE, PREPARE_FTDIR USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTDIR_MOD ,ONLY : LTDIR -USE FTDIR_CTL_MOD ,ONLY : GRID_TO_FOURIER +USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR +USE LEDIR_MOD, ONLY: LEDIR_HANDLE +USE FTDIR_CTL_MOD ,ONLY : GRID_TO_FOURIER, PREPARE_GRID_TO_FOURIER, GRID_TO_FOURIER_HANDLE USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS -USE TRLTOM_MOD ,ONLY : TRLTOM, TRLTOM_CUDAAWARE -USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT +USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM +USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT, FOURIER_OUT_HANDLE, PREPARE_FOURIER_OUT -USE TPM_TRANS, ONLY: REUSE_PTR USE TPM_DISTR, ONLY : D, NPROC -USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_UNPACK_BUFFER, LEDIR_ALLOC_SIZE +USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_UNPACK_BUFFER, LEDIR_ALLOC_SIZE, PREPARE_LEDIR_UNPACK, LEDIR_UNPACK_HANDLE + +USE ALLOCATOR_MOD +USE ISO_C_BINDING, ONLY: C_INT8_T ! IMPLICIT NONE @@ -132,50 +135,44 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=8) :: IALLOC_SZ, IALLOC_POS -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) +REAL(KIND=JPRBT), POINTER :: FOUBUF_IN(:) +REAL(KIND=JPRBT), POINTER :: FOUBUF(:) REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) -REAL(KIND=JPRBT), POINTER :: ZOUT(:) -REAL(KIND=JPRD), POINTER :: ZOUT0(:) - - -IALLOC_SZ=KF_FS*D%NLENGTF - -IF (NPROC == 1) THEN - ! If we have only one process, we skip the intermediate buffer, so we need more space - IALLOC_SZ = IALLOC_SZ + LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1)) -ELSE - IALLOC_SZ = MAX(IALLOC_SZ, LEDIR_ALLOC_SIZE(KF_FS)/SIZEOF(REUSE_PTR(1))) -ENDIF - -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(IALLOC_SZ)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZE(REUSE_PTR) < IALLOC_SZ) THEN - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(IALLOC_SZ)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF -PREEL_REAL => REUSE_PTR - -! ------------------------------------------------------------------ +TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR +TYPE(GRID_TO_FOURIER_HANDLE) :: HGRID_TO_FOURIER +TYPE(FTDIR_HANDLE) :: HFTDIR +TYPE(FOURIER_OUT_HANDLE) :: HFOURIER_OUT +TYPE(TRLTOM_HANDLE) :: HTRLTOM +TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK +TYPE(LEDIR_HANDLE) :: HLEDIR + + IF(NPROMATR > 0) THEN + PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" + STOP 4 + ENDIF -! Perform transform + ! Prepare everything + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HGRID_TO_FOURIER = PREPARE_GRID_TO_FOURIER(ALLOCATOR,KF_GP,KF_FS) + HFTDIR = PREPARE_FTDIR() + IF (NPROC > 1) THEN + HFOURIER_OUT = PREPARE_FOURIER_OUT(ALLOCATOR, KF_FS) + HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) + HLEDIR_UNPACK = PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) + ELSE + HLEDIR_UNPACK = PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) + ENDIF + HLEDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) -IF_GPB = 2*KF_UV_G+KF_SCALARS_G -IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN - PRINT *, "ERROR, not implemented right now" - stop 4 -ELSE + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR) ! from the PGP arrays to PREEL_REAL - CALL GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL,& + CALL GRID_TO_FOURIER(ALLOCATOR,HGRID_TO_FOURIER,KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) @@ -183,7 +180,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(PREEL_REAL,PREEL_COMPLEX,KF_FS) + CALL FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) ELSE PREEL_COMPLEX => PREEL_REAL ENDIF @@ -191,38 +188,31 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL GSTATS(153,0) IF (NPROC > 1) THEN - CALL FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KF_FS) -#ifdef USE_CUDA_AWARE_MPI_FT WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - CALL TRLTOM_CUDAAWARE(FOUBUF_IN,FOUBUF,KF_FS) -#else - CALL TRLTOM(FOUBUF_IN,FOUBUF,KF_FS) - -#endif - - CALL LEDIR_POINTERS(REUSE_PTR(1:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - IF (KF_FS > 0) CALL LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - !$ACC EXIT DATA DELETE(FOUBUF) - DEALLOCATE(FOUBUF) + IF (KF_FS > 0) THEN + CALL FOURIER_OUT(ALLOCATOR,HFOURIER_OUT,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) + CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + ENDIF ELSE ! Short cut - no need to go through tansforms, we will go directly into ! the legendre space, but for that we need twice the memory, roughly ! (but we don't need the send/recv buffers) - IALLOC_POS = 1+KF_FS*D%NLENGTF - CALL LEDIR_POINTERS(REUSE_PTR(IALLOC_POS:),KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) - IF (KF_FS > 0) CALL LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + FOUBUF => PREEL_COMPLEX + CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF CALL GSTATS(153,1) IF (KF_FS > 0) THEN - CALL LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2) + CALL LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2) ENDIF -ENDIF + !$ACC EXIT DATA DELETE(ALLOCATOR%PTR) + DEALLOCATE(ALLOCATOR%PTR) ! ------------------------------------------------------------------ END SUBROUTINE DIR_TRANS_CTL diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 index dae8aac1c..2150cb9a4 100755 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ b/src/trans/gpu/internal/fourier_out_mod.F90 @@ -9,8 +9,28 @@ ! MODULE FOURIER_OUT_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + TYPE FOURIER_OUT_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN + END TYPE CONTAINS -SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) + FUNCTION PREPARE_FOURIER_OUT(ALLOCATOR, KF_FS) RESULT(HFOURIER_OUT) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(FOURIER_OUT_HANDLE) :: HFOURIER_OUT + + REAL(KIND=JPRBT) :: DUMMY + + HFOURIER_OUT%HFOUBUF_IN = RESERVE(ALLOCATOR, D%NLENGT0B*KF_FS*2*SIZEOF(DUMMY)) + END FUNCTION +SUBROUTINE FOURIER_OUT(ALLOCATOR,HFOURIER_OUT,PREEL_COMPLEX,FOUBUF_IN,KF_FS) !**** *FOURIER_OUT* - Copy fourier data from local array to buffer @@ -23,7 +43,7 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ! CALL FOURIER_OUT(...) ! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KFIELDS - number of fields +! -------------------- KF_FS - number of fields ! ! Externals. None. ! ---------- @@ -38,35 +58,28 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) ! ------------------------------------------------------------------ +USE ALLOCATOR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN USE TPM_DIM, ONLY: R_NSMAX +USE ISO_C_BINDING ! IMPLICIT NONE REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS +REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF_IN(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(FOURIER_OUT_HANDLE), INTENT(IN) :: HFOURIER_OUT INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: SCAL -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - -ALLOCATE(FOUBUF_IN(D%NLENGT0B*KFIELDS*2)) -!$ACC ENTER DATA CREATE(FOUBUF_IN) +CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HFOURIER_OUT%HFOUBUF_IN),& + & 1_C_SIZE_T, D%NLENGT0B*KF_FS*2*SIZEOF(FOUBUF_IN(1))) !$ACC DATA PRESENT(D,G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) @@ -78,13 +91,13 @@ SUBROUTINE FOURIER_OUT(PREEL_COMPLEX,FOUBUF_IN,KFIELDS) !$ACC& ASYNC(1) TILE(32,16,1) DO KGL=1,D%NDGL_FS DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) - DO JF=1,KFIELDS + DO JF=1,KF_FS IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KFIELDS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - ISTA = D_NPNTGTB0(JM,KGL)*KFIELDS*2 + ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 6d3d33b32..499f32b05 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -9,8 +9,33 @@ ! MODULE FTDIR_CTL_MOD + USE TRGTOL_MOD, ONLY: GRID_TO_FOURIER_HANDLE + IMPLICIT NONE + CONTAINS -SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & +FUNCTION PREPARE_GRID_TO_FOURIER(ALLOCATOR,KF_GP,KF_FS) RESULT(HGRID_TO_FOURIER) + USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT + USE ALLOCATOR_MOD + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS + TYPE(GRID_TO_FOURIER_HANDLE) :: HGRID_TO_FOURIER + + REAL(KIND=JPRBT) :: DUMMY + + INTEGER(KIND=C_SIZE_T) :: NELEM + + HGRID_TO_FOURIER%HCOMBUFS = RESERVE(ALLOCATOR, KF_GP*D%NGPTOT*SIZEOF(DUMMY)) + + NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR + NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL + HGRID_TO_FOURIER%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + +END FUNCTION +SUBROUTINE GRID_TO_FOURIER(ALLOCATOR,HGRID_TO_FOURIER,KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & & KVSETUV,KVSETSC,KPTRGP,& & KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -56,26 +81,12 @@ SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & ! ------------------------------------------------------------------ USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_GEN, only: nout -!USE TPM_DIM -!USE TPM_GEOMETRY -USE TPM_DISTR ,ONLY : D, MYPROC, NPROC -USE TPM_TRANS, ONLY: REUSE_PTR - USE TRGTOL_MOD ,ONLY : TRGTOL_CUDAAWARE -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -use ieee_arithmetic -! +USE ALLOCATOR_MOD IMPLICIT NONE - - -! Dummy arguments - INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) @@ -88,7 +99,10 @@ SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_REAL(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_REAL(:) + +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(GRID_TO_FOURIER_HANDLE), INTENT(IN) :: HGRID_TO_FOURIER ! Local variables @@ -100,8 +114,6 @@ SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,IFIRST INTEGER(KIND=JPIM) :: ISIZE,IFIELDS,ICHUNK,ICHUNKS,JK -INTEGER(KIND=JPIM) :: IALLOC_SIZE - ! ------------------------------------------------------------------ ! Field distribution in Spectral/Fourier space @@ -152,10 +164,8 @@ SUBROUTINE GRID_TO_FOURIER(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & ! Transposition CALL GSTATS(158,0) - -CALL TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& +CALL TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) - CALL GSTATS(158,1) ! ------------------------------------------------------------------ END SUBROUTINE GRID_TO_FOURIER diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index eae855cf4..2cbac1257 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -9,8 +9,18 @@ ! MODULE FTDIR_MOD + IMPLICIT NONE + + TYPE FTDIR_HANDLE + END TYPE CONTAINS -SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) + + FUNCTION PREPARE_FTDIR() RESULT(HFTDIR) + IMPLICIT NONE + TYPE(FTDIR_HANDLE) :: HFTDIR + END FUNCTION + +SUBROUTINE FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) !**** *FTDIR - Direct Fourier transform @@ -58,6 +68,7 @@ SUBROUTINE FTDIR(PREEL_REAL,PREEL_COMPLEX,KFIELD) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) +TYPE(FTDIR_HANDLE) :: HFTDIR INTEGER(KIND=JPIM) :: IGLG,KGL diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index d024de3d4..3189b3942 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -11,12 +11,29 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM + USE ALLOCATOR_MOD + IMPLICIT NONE + + TYPE LEDIR_UNPACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA + END TYPE + TYPE LEDIR_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA + END TYPE - PRIVATE - PUBLIC :: LEDIR, LEDIR_ALLOC_SIZE, LEDIR_UNPACK_BUFFER, LEDIR_POINTERS INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS +FUNCTION PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) RESULT(HLEDIR_UNPACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK + + HLEDIR_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, LEDIR_ALLOC_SIZE(KF_FS)) +END FUNCTION SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD @@ -59,81 +76,78 @@ FUNCTION LEDIR_ALLOC_SIZE(KF_FS) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=8) :: LEDIR_ALLOC_SIZE + INTEGER(KIND=C_SIZE_T) :: LEDIR_ALLOC_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) ! Check if the reuse buffer is large enough - LEDIR_ALLOC_SIZE = ALIGN(IIN_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & - +ALIGN(IIN_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & - +ALIGN(IOUT_STRIDES1*D%NUMP,8)*SIZEOF(ZPRBT_DUMMY) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) & - +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZPRD_DUMMY) + LEDIR_ALLOC_SIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) END FUNCTION -SUBROUTINE LEDIR_POINTERS(ZBUF,KF_FS,ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) +SUBROUTINE LEDIR_POINTERS(ALLOCATOR,HLEDIR_UNPACK,KF_FS,ZINPS,ZINPA,ZINPS0,ZINPA0) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R USE TPM_DISTR ,ONLY : D USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(IN), TARGET :: ZBUF(:) - REAL(KIND=JPRBT), INTENT(OUT), OPTIONAL, POINTER :: ZINPS(:), ZINPA(:), ZOUT(:) - REAL(KIND=JPRD), INTENT(OUT), OPTIONAL, POINTER :: ZINPS0(:), ZINPA0(:), ZOUT0(:) + REAL(KIND=JPRBT), INTENT(OUT), OPTIONAL, POINTER :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(OUT), OPTIONAL, POINTER :: ZINPS0(:), ZINPA0(:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LEDIR_UNPACK_HANDLE), INTENT(IN) :: HLEDIR_UNPACK INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=8) :: ALLOC_POS - - REAL(KIND=JPRBT), POINTER :: ZBASE(:) - REAL(KIND=JPRD), POINTER :: ZBASE0(:) + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) - CALL C_F_POINTER(C_LOC(ZBUF(:)), ZBASE, & - & [SIZEOF(ZBUF(:))/SIZEOF(ZBASE(0))]) - - ALLOC_POS=1 - IF (PRESENT(ZINPS)) ZINPS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D%NUMP-1) - ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D%NUMP,8) - IF (PRESENT(ZINPA)) ZINPA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D%NUMP-1) - ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D%NUMP,8) - IF (PRESENT(ZOUT)) ZOUT(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D%NUMP-1) - ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D%NUMP,8) - - ! The BASE0 pointer points to the rest, but likely in a different type! - CALL C_F_POINTER(C_LOC(ZBASE(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(ZBASE(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) - ALLOC_POS=1 - IF (PRESENT(ZINPS0)) ZINPS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) - ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) - IF (PRESENT(ZINPA0)) ZINPA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) - ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) - IF (PRESENT(ZOUT0)) ZOUT0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) - ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) + IALLOC_POS=1 + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) + IF (PRESENT(ZINPS)) & + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) + IF (PRESENT(ZINPA)) & + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) + IF (PRESENT(ZINPS0)) & + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) + IF (PRESENT(ZINPA0)) & + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ END SUBROUTINE -SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) +SUBROUTINE LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F - USE TPM_TRANS, ONLY: REUSE_PTR USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC,NPROC USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -143,10 +157,12 @@ SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) IMPLICIT NONE - REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) - REAL(KIND=JPRBT), INTENT(INOUT) :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: FOUBUF(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LEDIR_UNPACK_HANDLE), INTENT(IN) :: HLEDIR_UNPACK REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) @@ -157,6 +173,8 @@ SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL + CALL LEDIR_POINTERS(ALLOCATOR,HLEDIR_UNPACK,KF_FS, & + ZINPS=ZINPS,ZINPA=ZINPA,ZINPS0=ZINPS0,ZINPA0=ZINPA0) CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& IIN0_STRIDES0=IIN0_STRIDES0) @@ -204,7 +222,7 @@ SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) OFFSET_VAR=D_NPTRLS(MYSETW) - PREEL_COMPLEX(1:) => REUSE_PTR(1:) + PREEL_COMPLEX(1:) => FOUBUF(1:) !$ACC DATA PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) DO JGL=1,R_NDGNH @@ -251,7 +269,7 @@ SUBROUTINE LEDIR_UNPACK_BUFFER(FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) !$ACC END DATA END SUBROUTINE -SUBROUTINE LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) +SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !**** *LEDIR* - Direct Legendre transform. @@ -315,9 +333,9 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) ! DUMMY ARGUMENTS REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) -REAL(KIND=JPRBT), INTENT(OUT) :: ZOUT(:) -REAL(KIND=JPRD), INTENT(OUT) :: ZOUT0(:) -REAL(KIND=JPRBT), INTENT(OUT) :: POA1(:,:,:) +REAL(KIND=JPRBT), INTENT(INOUT) :: ZOUT(:) +REAL(KIND=JPRD), INTENT(INOUT) :: ZOUT0(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: POA1(:,:,:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS ! LOCAL VARIABLES diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index bebb55dfe..47b89aedc 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 1987- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -9,8 +10,46 @@ ! MODULE LTDIR_MOD - CONTAINS - SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& + IMPLICIT NONE + +CONTAINS + FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLEDIR) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE ISO_C_BINDING + USE LEDIR_MOD + USE ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(LEDIR_HANDLE) :: HLEDIR + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + + ! POA1 + IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! POA2 + IALLOC_SZ = IALLOC_SZ + ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUT + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUT0 + IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HLEDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) + + END FUNCTION + SUBROUTINE LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -25,7 +64,7 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEOMETRY USE PREPSNM_MOD ,ONLY : PREPSNM - USE LEDIR_MOD ,ONLY : LEDIR + USE LEDIR_MOD USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB @@ -33,6 +72,7 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE ALLOCATOR_MOD !**** *LTDIR* - Control of Direct Legendre transform step @@ -106,16 +146,23 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) - REAL(KIND=JPRBT), INTENT(OUT) :: ZOUT(:) - REAL(KIND=JPRD), INTENT(OUT) :: ZOUT0(:) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA1(:,:,:) - REAL(KIND=JPRB), ALLOCATABLE, TARGET :: POA2(:,:,:) + REAL(KIND=JPRB), POINTER :: POA1_L(:), POA1(:,:,:) + REAL(KIND=JPRB), POINTER :: POA2_L(:), POA2(:,:,:) REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRBT), POINTER :: ZOUT(:) + REAL(KIND=JPRD), POINTER :: ZOUT0(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LEDIR_HANDLE), INTENT(IN) :: HLEDIR + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + + ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) @@ -130,11 +177,37 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& !* 2. PREPARE WORK ARRAYS. ! -------------------- - ALLOCATE(POA1(2*KF_FS,R%NTMAX+3,D%NUMP)) - !$ACC ENTER DATA CREATE(POA1) + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + + IALLOC_POS = 1 + + IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) + CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) + CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUT(1)),128) + CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUT0(1)),128) + CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ ! do the legendre transform - CALL LEDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,POA1,KF_FS) + CALL LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) @@ -148,9 +221,6 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& ! --------------------------------- IF( KF_UV > 0 ) THEN - ALLOCATE(POA2(4*KF_UV,R%NTMAX+3,D%NUMP)) - !$ACC ENTER DATA CREATE(POA2) - ! U and V are in POA1 IFIRST = 0 PU => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) @@ -169,9 +239,6 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& CALL UPDSPB(KF_UV,PVOR,PSPVOR,KFLDPTRUV) CALL UPDSPB(KF_UV,PDIV,PSPDIV,KFLDPTRUV) - - !$ACC EXIT DATA DELETE(POA2) - DEALLOCATE(POA2) ENDIF ! ------------------------------------------------------------------ @@ -183,9 +250,6 @@ SUBROUTINE LTDIR(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0,KF_FS,KF_UV,KF_SCALARS,& & PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2 , & & KFLDPTRUV,KFLDPTRSC) - - !$ACC EXIT DATA DELETE(POA1) - DEALLOCATE(POA1) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) diff --git a/src/trans/gpu/internal/ltinv_ctlad_mod.F90 b/src/trans/gpu/internal/ltinv_ctlad_mod.F90 index 4a3a1eff5..a794e5aa5 100755 --- a/src/trans/gpu/internal/ltinv_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctlad_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -58,7 +59,7 @@ SUBROUTINE LTINV_CTLAD(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN USE TPM_DISTR ,ONLY : D USE LTINVAD_MOD ,ONLY : LTINVAD -USE TRLTOM_MOD ,ONLY : TRLTOM +!USE TRLTOM_MOD ,ONLY : TRLTOM IMPLICIT NONE @@ -90,7 +91,7 @@ SUBROUTINE LTINV_CTLAD(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) ENDIF CALL GSTATS(180,0) -CALL TRLTOM(FOUBUF,FOUBUF_IN,2*KF_OUT_LT) +!CALL TRLTOM(FOUBUF,FOUBUF_IN,2*KF_OUT_LT) CALL GSTATS(180,1) IF (.NOT.LALLOPERM) DEALLOCATE(FOUBUF) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 59a4e3594..20cfdcf29 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -9,8 +9,14 @@ ! MODULE TRGTOL_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + TYPE GRID_TO_FOURIER_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL + END TYPE CONTAINS - SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& + SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *TRGTOL * - transposition of grid point data from column @@ -82,12 +88,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& USE OML_MOD ,ONLY : OML_MY_THREAD USE MPI USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE TPM_TRANS ,ONLY : NPROMA + USE ALLOCATOR_MOD IMPLICIT NONE - REAL(KIND=JPRBT),INTENT(OUT) :: PREEL_REAL(:) + REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) @@ -96,11 +102,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(GRID_TO_FOURIER_HANDLE), INTENT(IN) :: HGRID_TO_FOURIER ! LOCAL VARIABLES ! LOCAL INTEGER SCALARS - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:),ZCOMBUFR(:) + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) @@ -127,7 +135,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& INTEGER(KIND=JPIM) :: IFLDA(KF_GP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 @@ -229,6 +236,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& IRECVTOT(JROC) = IPOS*KF_FS ENDDO + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFR_AND_REEL),& + & KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1, KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))) + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) CALL GSTATS(1805,1) @@ -283,9 +293,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO - ! Do this with "enter data" syntax because we are in the PGP data clause - IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) - !$ACC ENTER DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFS),& + & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) !....Pack loop......................................................... !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) @@ -355,12 +364,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF !$ACC END DATA ENDDO - - IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) - !$ACC WAIT(1) - CALL GSTATS(1602,1) IF (LSYNC_TRANS) THEN @@ -368,7 +372,11 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& CALL MPL_BARRIER(CDSTRING='') CALL GSTATS(430,1) ENDIF + CALL GSTATS(411,0) + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFR_AND_REEL),& + & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) IR=0 @@ -471,14 +479,9 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDIF CALL GSTATS(411,1) - !$ACC EXIT DATA IF(ISEND_COUNTS > 0) DELETE(ZCOMBUFS) - IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) - ! Unpack loop......................................................... CALL GSTATS(1603,0) - - DO INR=1,IRECV_COUNTS IPROC=IRECV_TO_PROC(INR) ILEN = IRECVTOT(IPROC)/KF_FS @@ -493,7 +496,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& ENDDO ENDDO ENDDO - !$ACC WAIT(1) CALL GSTATS(1603,1) @@ -506,7 +508,6 @@ SUBROUTINE TRGTOL_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& !$ACC END DATA !PGPUV !$ACC END DATA !PGP - IF (IRECV_COUNTS > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 71f204788..2286393ab 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -9,9 +9,29 @@ ! MODULE TRLTOM_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + TYPE TRLTOM_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF + END TYPE CONTAINS -SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) + FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_HANDLE) :: HTRLTOM + + REAL(KIND=JPRBT) :: DUMMY + + HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, D%NLENGT1B*2*KF_FS*SIZEOF(DUMMY)) + END FUNCTION +SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !**** *TRLTOM * - transposition in Fourierspace @@ -74,14 +94,18 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) IMPLICIT NONE INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -REAL(KIND=JPRBT) ,INTENT(OUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT), ALLOCATABLE :: PFBUF_IN(:) +REAL(KIND=JPRBT) ,INTENT(OUT), POINTER :: PFBUF(:) +REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR +INTEGER::IGROUP_MS,IGROUP_WORLD,IRANKS1(NPRTRW),IRANKS2(NPRTRW) +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM + #ifdef PARKINDTRANS_SINGLE #define TRLTOM_DTYPE MPI_REAL #else @@ -90,8 +114,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) -ALLOCATE(PFBUF(D%NLENGT1B*2*KF_FS)) -!$ACC ENTER DATA CREATE(PFBUF) +CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& + & 1_C_SIZE_T, D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1))) + +!$ACC DATA PRESENT(PFBUF,PFBUF_IN) IF(NPROC > 1) THEN DO J=1,NPRTRW @@ -114,7 +140,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) TO_SEND = FROM_SEND + ILENS(IRANK) - 1 FROM_RECV = IOFFR(IRANK) + 1 TO_RECV = FROM_RECV + ILENR(IRANK) - 1 - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + !$ACC KERNELS ASYNC(1) PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) !$ACC END KERNELS ILENS(IRANK) = 0 @@ -145,130 +171,17 @@ SUBROUTINE TRLTOM_CUDAAWARE(PFBUF_IN,PFBUF,KF_FS) ILEN = D%NLTSGTB(MYSETW)*2*KF_FS ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 CALL GSTATS(1607,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + !$ACC PARALLEL LOOP DEFAULT(NONE) DO J=ISTA,ISTA+ILEN-1 PFBUF(J) = PFBUF_IN(J) ENDDO CALL GSTATS(1607,1) ENDIF -IF (ALLOCATED(PFBUF_IN)) THEN - !$ACC EXIT DATA DELETE(PFBUF_IN) - DEALLOCATE(PFBUF_IN) -ENDIF - +!$ACC END DATA IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ END SUBROUTINE TRLTOM_CUDAAWARE -SUBROUTINE TRLTOM(PFBUF_IN,PFBUF,KF_FS) - -!**** *TRLTOM * - transposition in Fourier space - -! Purpose. -! -------- -! Transpose Fourier coefficients from partitioning -! over latitudes to partitioning over wave numbers -! This is done between inverse Legendre Transform -! and inverse FFT. -! This is the inverse routine of TRMTOL. - -!** Interface. -! ---------- -! *CALL* *TRLTOM(...)* - -! Explicit arguments : PFBUF - Fourier coefficient buffer. It is -! -------------------- used for both input and output. -! KF_FS - Number of fields communicated - -! Implicit arguments : -! -------------------- - -! Method. -! ------- -! See documentation - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! MPP Group *ECMWF* - -! Modifications. -! -------------- -! Original : 95-10-01 -! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use -! (NCOMBFLEN) for nphase.eq.1 -! Modified : 99-05-28 D.Salmond - Optimise copies. -! Modified : 00-02-02 M.Hamrud - Remove NPHASE -! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message -! passing and buffer packing -! G.Mozdzynski : 08-01-01 Cleanup -! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM -USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE MPI - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -REAL(KIND=JPRBT) ,INTENT(INOUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) - -INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) -INTEGER(KIND=JPIM) :: J, ILEN, ISTA -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -INTEGER(KIND=JPIM) :: IERROR - -IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) - -ALLOCATE(PFBUF(D%NLENGT1B*2*KF_FS)) -!$ACC ENTER DATA CREATE(PFBUF) - -IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - - CALL GSTATS(806,0) - !$ACC UPDATE HOST(PFBUF_IN) - - CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& - & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& - & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRLTOM:') - - !$ACC UPDATE DEVICE(PFBUF) - CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) - CALL GSTATS(806,1) -ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 - CALL GSTATS(1607,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1607,1) -ENDIF - -!$ACC EXIT DATA DELETE(PFBUF_IN) -DEALLOCATE(PFBUF_IN) - -IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ -END SUBROUTINE TRLTOM END MODULE TRLTOM_MOD From e5fbb9e22e445c36dc20b0584847d8a924db6fc5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:35 -0700 Subject: [PATCH 224/263] Simplify KVSET computation in dirtrans --- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 56 ++++++++++-------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 499f32b05..8dc6875f1 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -106,59 +106,51 @@ SUBROUTINE GRID_TO_FOURIER(ALLOCATOR,HGRID_TO_FOURIER,KF_UV_G,KF_SCALARS_G,KF_GP ! Local variables -INTEGER(KIND=JPIM) :: IST,JGL,IGL,JF_FS -INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) -INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: IFGP2,IFGP3A,IFGP3B,IOFF,J3 -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,IFIRST -INTEGER(KIND=JPIM) :: ISIZE,IFIELDS,ICHUNK,ICHUNKS,JK +INTEGER(KIND=JPIM) :: IOFF,J3 ! ------------------------------------------------------------------ ! Field distribution in Spectral/Fourier space +IOFF=0 IF(PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G ELSE - IVSETUV(:) = -1 + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G ENDIF -IVSETSC(:) = -1 IF(PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) + IOFF=IOFF+KF_SCALARS_G ELSE - IOFF=0 IF(PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 + IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) + IOFF=IOFF+SIZE(KVSETSC2) ENDIF IF(PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - DO J3=1,UBOUND(PGP3A,3) - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A + DO J3=1,SIZE(PGP3A,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) ENDDO ENDIF IF(PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - DO J3=1,UBOUND(PGP3B,3) - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B + DO J3=1,SIZE(PGP3B,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) ENDDO ENDIF ENDIF -IST = 1 -IF(KF_UV_G > 0) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF(KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G +IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN + PRINT*, "ERROR IN KVSET COMPUTATION" + FLUSH(6) + STOP 38 ENDIF ! Transposition From c3181ddc33435fef2bb52360858ed6ea080153c0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:35 -0700 Subject: [PATCH 225/263] Remove ftdir_ctl wrapper --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 8 +- src/trans/gpu/internal/ftdir_ctl_mod.F90 | 165 ------------------- src/trans/gpu/internal/trgtol_mod.F90 | 108 +++++++++--- 3 files changed, 89 insertions(+), 192 deletions(-) delete mode 100755 src/trans/gpu/internal/ftdir_ctl_mod.F90 diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index db18fbfae..eeeb081b1 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -84,7 +84,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR USE LEDIR_MOD, ONLY: LEDIR_HANDLE -USE FTDIR_CTL_MOD ,ONLY : GRID_TO_FOURIER, PREPARE_GRID_TO_FOURIER, GRID_TO_FOURIER_HANDLE +USE TRGTOL_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM @@ -144,7 +144,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR -TYPE(GRID_TO_FOURIER_HANDLE) :: HGRID_TO_FOURIER +TYPE(TRGTOL_HANDLE) :: HTRGTOL TYPE(FTDIR_HANDLE) :: HFTDIR TYPE(FOURIER_OUT_HANDLE) :: HFOURIER_OUT TYPE(TRLTOM_HANDLE) :: HTRLTOM @@ -158,7 +158,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! Prepare everything ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() - HGRID_TO_FOURIER = PREPARE_GRID_TO_FOURIER(ALLOCATOR,KF_GP,KF_FS) + HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) HFTDIR = PREPARE_FTDIR() IF (NPROC > 1) THEN HFOURIER_OUT = PREPARE_FOURIER_OUT(ALLOCATOR, KF_FS) @@ -172,7 +172,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL INSTANTIATE_ALLOCATOR(ALLOCATOR) ! from the PGP arrays to PREEL_REAL - CALL GRID_TO_FOURIER(ALLOCATOR,HGRID_TO_FOURIER,KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL,& + CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 deleted file mode 100755 index 8dc6875f1..000000000 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ /dev/null @@ -1,165 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTDIR_CTL_MOD - USE TRGTOL_MOD, ONLY: GRID_TO_FOURIER_HANDLE - IMPLICIT NONE - -CONTAINS -FUNCTION PREPARE_GRID_TO_FOURIER(ALLOCATOR,KF_GP,KF_FS) RESULT(HGRID_TO_FOURIER) - USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT - USE ALLOCATOR_MOD - USE TPM_DISTR, ONLY: D - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS - TYPE(GRID_TO_FOURIER_HANDLE) :: HGRID_TO_FOURIER - - REAL(KIND=JPRBT) :: DUMMY - - INTEGER(KIND=C_SIZE_T) :: NELEM - - HGRID_TO_FOURIER%HCOMBUFS = RESERVE(ALLOCATOR, KF_GP*D%NGPTOT*SIZEOF(DUMMY)) - - NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR - NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL - HGRID_TO_FOURIER%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) - -END FUNCTION -SUBROUTINE GRID_TO_FOURIER(ALLOCATOR,HGRID_TO_FOURIER,KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,PREEL_REAL, & - & KVSETUV,KVSETSC,KPTRGP,& - & KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) - - -!**** *FTDIR_CTL - Direct Fourier transform control - -! Purpose. Control routine for Grid-point to Fourier transform -! -------- - -!** Interface. -! ---------- -! CALL FTDIR_CTL(..) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! PGP - gridpoint array -! KVSETUV - "B" set in spectral/fourier space for -! u and v variables -! KVSETSC - "B" set in spectral/fourier space for -! scalar variables -! KPTRGP - pointer array to fields in gridpoint space - -! Method. -! ------- - -! Externals. TRGTOL - transposition routine -! ---------- FOURIER_OUT - copy fourier data to Fourier buffer -! FTDIR - fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT -USE TRGTOL_MOD ,ONLY : TRGTOL_CUDAAWARE -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -USE ALLOCATOR_MOD - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_REAL(:) - -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(GRID_TO_FOURIER_HANDLE), INTENT(IN) :: HGRID_TO_FOURIER - -! Local variables - -INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: IOFF,J3 - -! ------------------------------------------------------------------ - -! Field distribution in Spectral/Fourier space - -IOFF=0 -IF(PRESENT(KVSETUV)) THEN - IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) - IOFF=IOFF+KF_UV_G -ELSE - IVSET(IOFF+1:IOFF+KF_UV_G) = -1 - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = -1 - IOFF=IOFF+KF_UV_G -ENDIF -IF(PRESENT(KVSETSC)) THEN - IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) - IOFF=IOFF+KF_SCALARS_G -ELSE - IF(PRESENT(KVSETSC2)) THEN - IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) - IOFF=IOFF+SIZE(KVSETSC2) - ENDIF - IF(PRESENT(KVSETSC3A)) THEN - DO J3=1,SIZE(PGP3A,3) - IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) - IOFF=IOFF+SIZE(KVSETSC3A) - ENDDO - ENDIF - IF(PRESENT(KVSETSC3B)) THEN - DO J3=1,SIZE(PGP3B,3) - IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) - IOFF=IOFF+SIZE(KVSETSC3B) - ENDDO - ENDIF -ENDIF - -IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN - PRINT*, "ERROR IN KVSET COMPUTATION" - FLUSH(6) - STOP 38 -ENDIF - -! Transposition - -CALL GSTATS(158,0) -CALL TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) -CALL GSTATS(158,1) -! ------------------------------------------------------------------ -END SUBROUTINE GRID_TO_FOURIER -END MODULE FTDIR_CTL_MOD - diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 20cfdcf29..9eade265b 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -12,12 +12,37 @@ MODULE TRGTOL_MOD USE ALLOCATOR_MOD IMPLICIT NONE - TYPE GRID_TO_FOURIER_HANDLE + PRIVATE + PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL + + TYPE TRGTOL_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL END TYPE CONTAINS - SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) + FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) + USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT + USE ALLOCATOR_MOD + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS + TYPE(TRGTOL_HANDLE) :: HTRGTOL + + REAL(KIND=JPRBT) :: DUMMY + + INTEGER(KIND=C_SIZE_T) :: NELEM + + HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, KF_GP*D%NGPTOT*SIZEOF(DUMMY)) + + NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR + NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL + HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + + END FUNCTION + SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& + &PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP,KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2) !**** *TRGTOL * - transposition of grid point data from column ! structure to latitudinal. Reorganize data between @@ -94,16 +119,13 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV IMPLICIT NONE REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGPUV(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3A(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:), KVSETSC(:), KVSETSC3A(:), KVSETSC3B(:), KVSETSC2(:) + REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:), PGPUV(:,:,:,:), PGP3A(:,:,:,:), PGP3B(:,:,:,:), PGP2(:,:,:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(GRID_TO_FOURIER_HANDLE), INTENT(IN) :: HGRID_TO_FOURIER + TYPE(TRGTOL_HANDLE), INTENT(IN) :: HTRGTOL ! LOCAL VARIABLES @@ -123,7 +145,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV &JGL, JI, JK, JL, ISETW, IFLD, & &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT - INTEGER(KIND=JPIM) :: KF, KGL, KI + INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V @@ -133,6 +155,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -153,8 +176,47 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV !* 0. Some initializations ! -------------------- + IOFF=0 + IF(PRESENT(KVSETUV)) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + ELSE + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + ENDIF + IF(PRESENT(KVSETSC)) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ELSE + IF(PRESENT(KVSETSC2)) THEN + IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) + IOFF=IOFF+SIZE(KVSETSC2) + ENDIF + IF(PRESENT(KVSETSC3A)) THEN + DO J3=1,SIZE(PGP3A,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF(PRESENT(KVSETSC3B)) THEN + DO J3=1,SIZE(PGP3B,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + ENDIF + + IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN + PRINT*, "ERROR IN IVSET COMPUTATION" + FLUSH(6) + STOP 38 + ENDIF - IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',0,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE) CALL GSTATS(1805,0) IOFF=1 @@ -171,12 +233,12 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV ! Prepare sender arrays ! find number of fields on a certain V-set IF(NPRTRV == 1) THEN - ! This is needed because KVSET(JFLD) == -1 if there is only one V-set + ! This is needed because IVSET(JFLD) == -1 if there is only one V-set ISEND_FIELD_COUNT(1) = KF_GP ELSE ISEND_FIELD_COUNT(:) = 0 DO JFLD=1,KF_GP - ISEND_FIELD_COUNT(KVSET(JFLD)) = ISEND_FIELD_COUNT(KVSET(JFLD)) + 1 + ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1 ENDDO ENDIF ! find number of grid-points on a certain W-set that overlap with myself @@ -236,7 +298,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV IRECVTOT(JROC) = IPOS*KF_FS ENDDO - CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFR_AND_REEL),& + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& & KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1, KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))) !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) @@ -293,7 +355,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFS),& + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) !....Pack loop......................................................... @@ -309,7 +371,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV IFLDS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN IFLDS = IFLDS+1 IF(PRESENT(KPTRGP)) THEN IFLDA(IFLDS)=KPTRGP(JFLD) @@ -374,7 +436,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV ENDIF CALL GSTATS(411,0) - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HGRID_TO_FOURIER%HCOMBUFR_AND_REEL),& + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) @@ -405,7 +467,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV ! Input is KF_GP fields. We find the resulting KF_FS fields. IFLDS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN IFLDS = IFLDS+1 IF(PRESENT(KPTRGP)) THEN IFLDA(IFLDS) = KPTRGP(JFLD) @@ -470,7 +532,7 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV IF(IR > 0) THEN CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRGTOL_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') + & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES') ENDIF IF (LSYNC_TRANS) THEN CALL GSTATS(431,0) @@ -509,8 +571,8 @@ SUBROUTINE TRGTOL_CUDAAWARE(ALLOCATOR,HGRID_TO_FOURIER,PREEL_REAL,KF_FS,KF_GP,KV !$ACC END DATA !PGP - IF (LHOOK) CALL DR_HOOK('TRGTOL_CUDAAWARE',1,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) - END SUBROUTINE TRGTOL_CUDAAWARE + END SUBROUTINE TRGTOL END MODULE TRGTOL_MOD From 0475c8f40708b24d958ef26def3414ac3efd26c4 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:35 -0700 Subject: [PATCH 226/263] simplify ledir a bit --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 7 +- src/trans/gpu/internal/ledir_mod.F90 | 138 ++++++++----------- 2 files changed, 56 insertions(+), 89 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index eeeb081b1..1f727eb56 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -91,7 +91,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT, FOURIER_OUT_HANDLE, PREPARE_FOURIER_OUT USE TPM_DISTR, ONLY : D, NPROC -USE LEDIR_MOD ,ONLY : LEDIR_POINTERS, LEDIR_UNPACK_BUFFER, LEDIR_ALLOC_SIZE, PREPARE_LEDIR_UNPACK, LEDIR_UNPACK_HANDLE +USE LEDIR_MOD ,ONLY : LEDIR_UNPACK_BUFFER, PREPARE_LEDIR_UNPACK, LEDIR_UNPACK_HANDLE USE ALLOCATOR_MOD USE ISO_C_BINDING, ONLY: C_INT8_T @@ -133,8 +133,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB -INTEGER(KIND=8) :: IALLOC_SZ, IALLOC_POS - REAL(KIND=JPRBT), POINTER :: FOUBUF_IN(:) REAL(KIND=JPRBT), POINTER :: FOUBUF(:) REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) @@ -200,8 +198,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! Short cut - no need to go through tansforms, we will go directly into ! the legendre space, but for that we need twice the memory, roughly ! (but we don't need the send/recv buffers) - FOUBUF => PREEL_COMPLEX - CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF CALL GSTATS(153,1) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 3189b3942..ddbdeca3b 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -25,14 +25,33 @@ MODULE LEDIR_MOD INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS FUNCTION PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) RESULT(HLEDIR_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD - IMPLICIT NONE + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + USE TPM_DISTR ,ONLY : D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=C_SIZE_T) :: ISIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY - HLEDIR_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, LEDIR_ALLOC_SIZE(KF_FS)) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + + ! Check if the reuse buffer is large enough + ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HLEDIR_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) END FUNCTION SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) @@ -66,115 +85,66 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRI IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) END SUBROUTINE - -FUNCTION LEDIR_ALLOC_SIZE(KF_FS) +SUBROUTINE LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D - - IMPLICIT NONE - - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=C_SIZE_T) :: LEDIR_ALLOC_SIZE - - REAL(KIND=JPRBT) :: ZPRBT_DUMMY - REAL(KIND=JPRD) :: ZPRD_DUMMY - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) - - ! Check if the reuse buffer is large enough - LEDIR_ALLOC_SIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - LEDIR_ALLOC_SIZE = LEDIR_ALLOC_SIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) -END FUNCTION + USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL + USE TPM_GEOMETRY ,ONLY : G, G_NDGLU + USE TPM_FIELDS ,ONLY : F + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC,NPROC + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -SUBROUTINE LEDIR_POINTERS(ALLOCATOR,HLEDIR_UNPACK,KF_FS,ZINPS,ZINPA,ZINPS0,ZINPA0) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D + USE TPM_DISTR ,ONLY : D,MYSETW, NPROC,D_NSTAGTF,D_NPTRLS + USE TPM_GEOMETRY ,ONLY : G,G_NLOEN USE, INTRINSIC :: ISO_C_BINDING + IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(OUT), OPTIONAL, POINTER :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), INTENT(OUT), OPTIONAL, POINTER :: ZINPS0(:), ZINPA0(:) - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: FOUBUF(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LEDIR_UNPACK_HANDLE), INTENT(IN) :: HLEDIR_UNPACK + REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=8) :: JF, OFFSET_VAR + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC + + REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) IALLOC_POS=1 IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) - IF (PRESENT(ZINPS)) & - CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) - IF (PRESENT(ZINPA)) & - CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) - IF (PRESENT(ZINPS0)) & - CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) - IF (PRESENT(ZINPA0)) & - CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ -END SUBROUTINE - -SUBROUTINE LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL - USE TPM_GEOMETRY ,ONLY : G, G_NDGLU - USE TPM_FIELDS ,ONLY : F - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC,NPROC - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - USE TPM_DISTR ,ONLY : D,MYSETW, NPROC,D_NSTAGTF,D_NPTRLS - USE TPM_GEOMETRY ,ONLY : G,G_NLOEN - USE, INTRINSIC :: ISO_C_BINDING - - IMPLICIT NONE - - REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: FOUBUF(:) - REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(LEDIR_UNPACK_HANDLE), INTENT(IN) :: HLEDIR_UNPACK - - REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0 - INTEGER(KIND=8) :: JF, OFFSET_VAR - INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC - - REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL - - CALL LEDIR_POINTERS(ALLOCATOR,HLEDIR_UNPACK,KF_FS, & - ZINPS=ZINPS,ZINPA=ZINPA,ZINPS0=ZINPS0,ZINPA0=ZINPA0) CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& IIN0_STRIDES0=IIN0_STRIDES0) From 30c9ece3ebbbd9b7bcae508048ead8c62ee8504b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:35 -0700 Subject: [PATCH 227/263] share reuse_ptr --- src/trans/gpu/internal/allocator_mod.F90 | 6 +++++- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 13 +++++++++---- src/trans/gpu/internal/tpm_trans.F90 | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index 7f351e369..9b0c5bdf4 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -79,19 +79,23 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) ALLOCATOR%BUFR_SZ(1) = ALIGN(ALLOCATOR%BUFR_SZ(1),128) ALLOCATOR%BUFR_SZ(2) = ALIGN(ALLOCATOR%BUFR_SZ(2),128) - IF (PRESENT(OLD_PTR)) THEN + IF (ASSOCIATED(OLD_PTR)) THEN IF (SIZEOF(OLD_PTR) < SUM(ALLOCATOR%BUFR_SZ) ) THEN !$ACC EXIT DATA DELETE(OLD_PTR) IF(PRESENT(OLD_PTR)) DEALLOCATE(OLD_PTR) NULLIFY(OLD_PTR) ALLOCATE(ALLOCATOR%PTR(1:SUM(ALLOCATOR%BUFR_SZ))) !$ACC ENTER DATA CREATE(ALLOCATOR%PTR) + + OLD_PTR => ALLOCATOR%PTR ELSE ALLOCATOR%PTR(1:) => OLD_PTR(1:) + NULLIFY(OLD_PTR) ENDIF ELSE ALLOCATE(ALLOCATOR%PTR(1:SUM(ALLOCATOR%BUFR_SZ))) !$ACC ENTER DATA CREATE(ALLOCATOR%PTR) + OLD_PTR => ALLOCATOR%PTR ENDIF END SUBROUTINE diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 1f727eb56..466435148 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -92,6 +92,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE TPM_DISTR, ONLY : D, NPROC USE LEDIR_MOD ,ONLY : LEDIR_UNPACK_BUFFER, PREPARE_LEDIR_UNPACK, LEDIR_UNPACK_HANDLE +USE TPM_TRANS, ONLY:REUSE_PTR USE ALLOCATOR_MOD USE ISO_C_BINDING, ONLY: C_INT8_T @@ -149,6 +150,8 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK TYPE(LEDIR_HANDLE) :: HLEDIR +INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) + IF(NPROMATR > 0) THEN PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" STOP 4 @@ -167,7 +170,12 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ENDIF HLEDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) - CALL INSTANTIATE_ALLOCATOR(ALLOCATOR) + ! TODO this is going to be simplified when we have it implemented for invtrans too + IF(ALLOCATED(REUSE_PTR)) & + CALL C_F_POINTER(C_LOC(REUSE_PTR), PTR, (/ SIZEOF(REUSE_PTR) /) ) + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, PTR) + IF (ASSOCIATED(PTR)) & + CALL C_F_POINTER(C_LOC(PTR), REUSE_PTR, (/ SIZEOF(PTR)/SIZEOF(REUSE_PTR(1)) /) ) ! from the PGP arrays to PREEL_REAL CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -207,9 +215,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2) ENDIF - - !$ACC EXIT DATA DELETE(ALLOCATOR%PTR) - DEALLOCATE(ALLOCATOR%PTR) ! ------------------------------------------------------------------ END SUBROUTINE DIR_TRANS_CTL diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 158f7680a..c9d1e71b8 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -69,6 +69,6 @@ MODULE TPM_TRANS ! we adapt the size. After 2 iterations this should lead to constant runtimes ! (the first iteration is used to get the max buffer size, the second iteration ! is going to recreate the graphs if needed) -REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: REUSE_PTR(:) +REAL(KIND=JPRBT),POINTER :: REUSE_PTR(:) END MODULE TPM_TRANS From 36581dec62c3223013f8c55b34a36f5211b4122a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:36 -0700 Subject: [PATCH 228/263] Merge fourier_out and pack_buffs into new file --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 23 +- src/trans/gpu/internal/fourier_out_mod.F90 | 114 ------ src/trans/gpu/internal/ledir_mod.F90 | 187 --------- src/trans/gpu/internal/trgtol_mod.F90 | 5 + src/trans/gpu/internal/trltom_pack_unpack.F90 | 386 ++++++++++++++++++ 5 files changed, 402 insertions(+), 313 deletions(-) delete mode 100755 src/trans/gpu/internal/fourier_out_mod.F90 create mode 100755 src/trans/gpu/internal/trltom_pack_unpack.F90 diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 466435148..7e30d283f 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -88,10 +88,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM -USE FOURIER_OUT_MOD , ONLY: FOURIER_OUT, FOURIER_OUT_HANDLE, PREPARE_FOURIER_OUT +USE TRLTOM_PACK_UNPACK USE TPM_DISTR, ONLY : D, NPROC -USE LEDIR_MOD ,ONLY : LEDIR_UNPACK_BUFFER, PREPARE_LEDIR_UNPACK, LEDIR_UNPACK_HANDLE USE TPM_TRANS, ONLY:REUSE_PTR USE ALLOCATOR_MOD @@ -145,9 +144,10 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR TYPE(TRGTOL_HANDLE) :: HTRGTOL TYPE(FTDIR_HANDLE) :: HFTDIR -TYPE(FOURIER_OUT_HANDLE) :: HFOURIER_OUT +TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK TYPE(TRLTOM_HANDLE) :: HTRLTOM -TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK +TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK +TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT TYPE(LEDIR_HANDLE) :: HLEDIR INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) @@ -162,11 +162,11 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) HFTDIR = PREPARE_FTDIR() IF (NPROC > 1) THEN - HFOURIER_OUT = PREPARE_FOURIER_OUT(ALLOCATOR, KF_FS) + HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) - HLEDIR_UNPACK = PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) + HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) ELSE - HLEDIR_UNPACK = PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) + HTRLTOM_DIRECT = PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) ENDIF HLEDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) @@ -197,16 +197,15 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' IF (KF_FS > 0) THEN - CALL FOURIER_OUT(ALLOCATOR,HFOURIER_OUT,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) - CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF ELSE ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space, but for that we need twice the memory, roughly - ! (but we don't need the send/recv buffers) - CALL LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + ! the legendre space + CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF CALL GSTATS(153,1) diff --git a/src/trans/gpu/internal/fourier_out_mod.F90 b/src/trans/gpu/internal/fourier_out_mod.F90 deleted file mode 100755 index 2150cb9a4..000000000 --- a/src/trans/gpu/internal/fourier_out_mod.F90 +++ /dev/null @@ -1,114 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FOURIER_OUT_MOD - USE ALLOCATOR_MOD - IMPLICIT NONE - - TYPE FOURIER_OUT_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN - END TYPE -CONTAINS - FUNCTION PREPARE_FOURIER_OUT(ALLOCATOR, KF_FS) RESULT(HFOURIER_OUT) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: D - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(FOURIER_OUT_HANDLE) :: HFOURIER_OUT - - REAL(KIND=JPRBT) :: DUMMY - - HFOURIER_OUT%HFOUBUF_IN = RESERVE(ALLOCATOR, D%NLENGT0B*KF_FS*2*SIZEOF(DUMMY)) - END FUNCTION -SUBROUTINE FOURIER_OUT(ALLOCATOR,HFOURIER_OUT,PREEL_COMPLEX,FOUBUF_IN,KF_FS) - -!**** *FOURIER_OUT* - Copy fourier data from local array to buffer - -! Purpose. -! -------- -! Routine for copying fourier data from local array to buffer - -!** Interface. -! ---------- -! CALL FOURIER_OUT(...) - -! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KF_FS - number of fields -! -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 2000-04-01 - -! ------------------------------------------------------------------ - -USE ALLOCATOR_MOD -USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT -USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN -USE TPM_DIM, ONLY: R_NSMAX -USE ISO_C_BINDING -! - -IMPLICIT NONE - -REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) -REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF_IN(:) -INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(FOURIER_OUT_HANDLE), INTENT(IN) :: HFOURIER_OUT - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL - -REAL(KIND=JPRBT) :: SCAL - -CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HFOURIER_OUT%HFOUBUF_IN),& - & 1_C_SIZE_T, D%NLENGT0B*KF_FS*2*SIZEOF(FOUBUF_IN(1))) - -!$ACC DATA PRESENT(D,G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) - -! scale results and move into next transformation buffer - -OFFSET_VAR=D_NPTRLS(MYSETW) - -!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) DEFAULT(NONE) & -!$ACC& ASYNC(1) TILE(32,16,1) -DO KGL=1,D%NDGL_FS - DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) - DO JF=1,KF_FS - IGLG = OFFSET_VAR+KGL-1 - IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 - - FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) - FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) - ENDIF - ENDDO - ENDDO -ENDDO -!$ACC END DATA - -!$ACC WAIT(1) - -END SUBROUTINE FOURIER_OUT -END MODULE FOURIER_OUT_MOD - diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index ddbdeca3b..cd7ca1567 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -14,45 +14,12 @@ MODULE LEDIR_MOD USE ALLOCATOR_MOD IMPLICIT NONE - TYPE LEDIR_UNPACK_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA - END TYPE TYPE LEDIR_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA END TYPE - INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS -FUNCTION PREPARE_LEDIR_UNPACK(ALLOCATOR, KF_FS) RESULT(HLEDIR_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(LEDIR_UNPACK_HANDLE) :: HLEDIR_UNPACK - - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=C_SIZE_T) :: ISIZE - - REAL(KIND=JPRBT) :: ZPRBT_DUMMY - REAL(KIND=JPRD) :: ZPRD_DUMMY - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) - - ! Check if the reuse buffer is large enough - ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - - HLEDIR_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) -END FUNCTION SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD @@ -85,160 +52,6 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRI IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) END SUBROUTINE -SUBROUTINE LEDIR_UNPACK_BUFFER(ALLOCATOR,HLEDIR_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL - USE TPM_GEOMETRY ,ONLY : G, G_NDGLU - USE TPM_FIELDS ,ONLY : F - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC,NPROC - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - USE TPM_DISTR ,ONLY : D,MYSETW, NPROC,D_NSTAGTF,D_NPTRLS - USE TPM_GEOMETRY ,ONLY : G,G_NLOEN - USE, INTRINSIC :: ISO_C_BINDING - - IMPLICIT NONE - - REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: FOUBUF(:) - REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(LEDIR_UNPACK_HANDLE), INTENT(IN) :: HLEDIR_UNPACK - - REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - - INTEGER(KIND=8) :: JF, OFFSET_VAR - INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC - - REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL - - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) - - IALLOC_POS=1 - - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) - CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) - CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) - CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) - CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HLEDIR_UNPACK%ZINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0) - - !$ACC DATA & - !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & - !$ACC& PRESENT(F,F%RW) & - !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & - !$ACC& PRESENT(D_NPNTGTB1) - - IF (NPROC > 1) THEN - - !$ACC DATA PRESENT(FOUBUF) - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) - DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS - PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) - PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) - ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) - ENDIF - ENDIF - ENDDO - ENDDO - END DO - !$ACC END DATA - ELSE - - OFFSET_VAR=D_NPTRLS(MYSETW) - - PREEL_COMPLEX(1:) => FOUBUF(1:) - !$ACC DATA PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) - DO JGL=1,R_NDGNH - DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = JGL - - OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) - - IGLS = R_NDGL+1-JGL - OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) - - PAIA = V1-V2 - PAIS = V1+V2 - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) - ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) - ENDIF - ENDIF - ENDDO - ENDDO - END DO - !$ACC END DATA - ENDIF - - !$ACC END DATA -END SUBROUTINE SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !**** *LEDIR* - Direct Legendre transform. diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 9eade265b..5ca7378f7 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -176,6 +176,11 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !* 0. Some initializations ! -------------------- + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) IOFF=0 IF(PRESENT(KVSETUV)) THEN IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 new file mode 100755 index 000000000..d2f118aff --- /dev/null +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -0,0 +1,386 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRLTOM_PACK_UNPACK + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRLTOM_PACK_HANDLE, PREPARE_TRLTOM_PACK, TRLTOM_PACK + PUBLIC :: TRLTOM_UNPACK_HANDLE, PREPARE_TRLTOM_UNPACK, TRLTOM_UNPACK + PUBLIC :: TRLTOM_DIRECT_HANDLE, PREPARE_TRLTOM_DIRECT, TRLTOM_DIRECT + + TYPE TRLTOM_PACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN + END TYPE + TYPE TRLTOM_UNPACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA + END TYPE + TYPE TRLTOM_DIRECT_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA + END TYPE +CONTAINS + FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK + + REAL(KIND=JPRBT) :: DUMMY + + HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, D%NLENGT0B*KF_FS*2*SIZEOF(DUMMY)) + END FUNCTION + + SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + !**** *TRLTOM_PACK* - Copy fourier data from local array to buffer + + ! Purpose. + ! -------- + ! Routine for copying fourier data from local array to buffer + + !** Interface. + ! ---------- + ! CALL TRLTOM_PACK(...) + + ! Explicit arguments : PREEL - local fourier/GP array + ! -------------------- KF_FS - number of fields + ! + ! Externals. None. + ! ---------- + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! ------------------------------------------------------------------ + + USE ALLOCATOR_MOD + USE PARKIND_ECTRANS, ONLY : JPIM,JPRBT + USE TPM_DISTR, ONLY : D,MYSETW,D_NSTAGTF,D_NPNTGTB0,D_NPTRLS + USE TPM_GEOMETRY, ONLY : G,G_NMEN,G_NLOEN + USE TPM_DIM, ONLY: R_NSMAX + USE ISO_C_BINDING + ! + + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF_IN(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK + + INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL + + REAL(KIND=JPRBT) :: SCAL + + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& + & 1_C_SIZE_T, D%NLENGT0B*KF_FS*2*SIZEOF(FOUBUF_IN(1))) + + !$ACC DATA PRESENT(D,G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) ASYNC(1) + + ! scale results and move into next transformation buffer + + OFFSET_VAR=D_NPTRLS(MYSETW) + + !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) DEFAULT(NONE) & + !$ACC& ASYNC(1) TILE(32,16,1) + DO KGL=1,D%NDGL_FS + DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) + DO JF=1,KF_FS + IGLG = OFFSET_VAR+KGL-1 + IF (JM <= G_NMEN(IGLG)) THEN + IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 + + FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) + FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) + ENDIF + ENDDO + ENDDO + ENDDO + !$ACC END DATA + + !$ACC WAIT(1) + END SUBROUTINE TRLTOM_PACK + + FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY : D + USE LEDIR_MOD, ONLY: LEDIR_STRIDES + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=C_SIZE_T) :: ISIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + + ! Check if the reuse buffer is large enough + ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HTRLTOM_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + END FUNCTION + + SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + USE PARKIND_ECTRANS, ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM, ONLY : R, R_NDGNH, R_NDGL + USE TPM_GEOMETRY, ONLY : G, G_NDGLU + USE TPM_FIELDS, ONLY : F + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 + USE LEDIR_MOD, ONLY : LEDIR_STRIDES + USE, INTRINSIC :: ISO_C_BINDING + + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_UNPACK_HANDLE), INTENT(IN) :: HTRLTOM_UNPACK + + REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + INTEGER(KIND=8) :: JF + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, KMLOC + + REAL(KIND=JPRBT) :: PAIA, PAIS + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + + IALLOC_POS=1 + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & + !$ACC& PRESENT(F,F%RW) & + !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & + !$ACC& PRESENT(D_NPNTGTB1) + + !$ACC DATA PRESENT(FOUBUF) + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) + PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF + ENDIF + ENDDO + ENDDO + END DO + !$ACC END DATA + + !$ACC END DATA + END SUBROUTINE + + FUNCTION PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) RESULT(HTRLTOM_DIRECT) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY : D + USE LEDIR_MOD, ONLY : LEDIR_STRIDES + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=C_SIZE_T) :: ISIZE + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + + ! Check if the reuse buffer is large enough + ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HTRLTOM_DIRECT%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + END FUNCTION + + SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL + USE TPM_GEOMETRY ,ONLY : G, G_NDGLU, G_NLOEN + USE TPM_FIELDS ,ONLY : F + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYSETW,D_NSTAGTF,D_NPTRLS + USE LEDIR_MOD, ONLY: LEDIR_STRIDES + USE, INTRINSIC :: ISO_C_BINDING + + IMPLICIT NONE + + REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) + REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_DIRECT_HANDLE), INTENT(IN) :: HTRLTOM_DIRECT + + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + INTEGER(KIND=8) :: JF, OFFSET_VAR + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC + + REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + + IALLOC_POS=1 + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS=IALLOC_POS+IALLOC_SZ + + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& + IIN0_STRIDES0=IIN0_STRIDES0) + + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & + !$ACC& PRESENT(F,F%RW) & + !$ACC& PRESENT(D,D_MYMS,G,G_NDGLU) & + !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) + + OFFSET_VAR=D_NPTRLS(MYSETW) + + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,OFFSET1,OFFSET2,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) + DO JGL=1,R_NDGNH + DO KMLOC=1,D_NUMP + DO JF=1,KF_FS*2 + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = JGL + + OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) + + IGLS = R_NDGL+1-JGL + OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) + IGLG = OFFSET_VAR+IGLS-1 + SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) + V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) + + PAIA = V1-V2 + PAIS = V1+V2 + IF (JF <= 4*KF_UV) THEN + ! Multiply in case of velocity + PAIA = PAIA*F%RACTHE(JGL) + PAIS = PAIS*F%RACTHE(JGL) + ENDIF + IF (KM /= 0) THEN + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ELSEIF (MOD(JF-1,2) == 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) + ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) + ENDIF + ENDIF + ENDDO + ENDDO + END DO + + !$ACC END DATA + END SUBROUTINE +END MODULE TRLTOM_PACK_UNPACK + From 057c4c3b341f1ca0465daa1c107932b486ec9696 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:36 -0700 Subject: [PATCH 229/263] remove ltinv_ctl_mod calls --- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 171 +++---------------- src/trans/gpu/internal/ledir_mod.F90 | 3 + src/trans/gpu/internal/ltinv_ctl_mod.F90 | 19 --- src/trans/gpu/internal/ltinv_mod.F90 | 10 +- 4 files changed, 26 insertions(+), 177 deletions(-) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 1c7dd6dbf..a8b4254da 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -86,13 +86,14 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE PARKIND1 ,ONLY : JPIM ,JPRB -USE TPM_GEN ,ONLY : NPROMATR +USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTINV_CTL_MOD ,ONLY : LTINV_CTL USE FTINV_CTL_MOD ,ONLY : FTINV_CTL +USE TRMTOL_MOD +USE LTINV_MOD ! IMPLICIT NONE @@ -128,163 +129,31 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! Local variables -INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) -INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) -INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) -INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G -INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_SCDERS,IF_OUT_LT -INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB,IST INTEGER(KIND=JPIM) :: KFIELD REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF(:) +REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) ! ------------------------------------------------------------------ -! Perform transform + IF(NPROMATR > 0) THEN + print *, "This is currently not supported and/or tested (NPROMATR > 0j" + stop 24 + ENDIF -IF_GPB = 2*KF_UV_G+KF_SCALARS_G -IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN - print *, "This is currently not supported and/or tested" - stop 24 - - ! Fields to be split into packets - - CALL SHUFFLE(KF_UV_G,KF_SCALARS_G,ISHFUV_G,IVSETUV,ISHFSC_G,IVSETSC,& - & KVSETUV,KVSETSC) - - IBLKS=(IF_GPB-1)/NPROMATR+1 - - DO JBLK=1,IBLKS - - CALL FIELD_SPLIT(JBLK,IF_GPB,KF_UV_G,IVSETUV,IVSETSC,& - & ISTUV_G,IENUV_G,IF_UV_G,ISTSC_G,IENSC_G,IF_SCALARS_G,& - & ISTUV,IENUV,IF_UV,ISTSC,IENSC,IF_SCALARS) - - IF(LSCDERS) THEN - IF_SCDERS = IF_SCALARS - ELSE - IF_SCDERS = 0 - ENDIF - - IF_OUT_LT = 2*IF_UV + IF_SCALARS+IF_SCDERS - IF(LVORGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV - ENDIF - IF(LDIVGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV - ENDIF - IF_FS = IF_OUT_LT+IF_SCDERS - IF(LUVDER) THEN - IF_FS = IF_FS+2*IF_UV - ENDIF - - IF_GP = 2*IF_UV_G+IF_SCALARS_G - IOFFD = 0 - IOFFU = 0 - IOFFV = KF_UV_G - IOFFUVD = 2*KF_UV_G+KF_SCALARS_G - IOFFSC = 2*KF_UV_G - IF(LVORGP) THEN - IF_GP = IF_GP+IF_UV_G - IOFFD = KF_UV_G - IOFFU = IOFFU+KF_UV_G - IOFFV = IOFFV+KF_UV_G - IOFFUVD =IOFFUVD+KF_UV_G - IOFFSC = IOFFSC+KF_UV_G - ENDIF - IF(LDIVGP) THEN - IF_GP = IF_GP+IF_UV_G - IOFFU = IOFFU+KF_UV_G - IOFFV = IOFFV+KF_UV_G - IOFFUVD =IOFFUVD+KF_UV_G - IOFFSC = IOFFSC+KF_UV_G - ENDIF - IF(LSCDERS) THEN - IF_GP = IF_GP+2*IF_SCALARS_G - IOFFUVD =IOFFUVD+KF_SCALARS_G - IOFFSCNS = IOFFSC+KF_SCALARS_G - IOFFSCEW = IOFFSC+2*KF_SCALARS_G - ENDIF - IF(LUVDER) THEN - IF_GP = IF_GP+2*IF_UV_G - IOFFSCEW = IOFFSCEW+2*KF_UV_G - ENDIF - - DO JFLD=1,IF_UV_G - IOFF = 0 - IF(LVORGP) THEN - IPTRGP(JFLD+IOFF) = ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - ENDIF - IF(LDIVGP) THEN - IPTRGP(JFLD+IOFF) = IOFFD+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - ENDIF - IPTRGP(JFLD+IOFF) = IOFFU+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFV+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G+IF_SCALARS_G - IF(LSCDERS) THEN - IOFF = IOFF+IF_SCALARS_G - ENDIF - IF(LUVDER) THEN - IPTRGP(JFLD+IOFF) = IOFFUVD+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFUVD+KF_UV_G+ISHFUV_G(ISTUV_G+JFLD-1) - ENDIF - ENDDO - - DO JFLD=1,IF_SCALARS_G - IOFF = 2*IF_UV_G - IF (LVORGP) IOFF = IOFF+IF_UV_G - IF (LDIVGP) IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFSC+ISHFSC_G(ISTSC_G+JFLD-1) - IOFF = IOFF+IF_SCALARS_G - IF(LSCDERS) THEN - IPTRGP(JFLD+IOFF) = IOFFSCNS+ISHFSC_G(ISTSC_G+JFLD-1) - IOFF = IOFF+IF_SCALARS_G - IF(LUVDER) THEN - IOFF = IOFF+2*IF_UV_G - ENDIF - IPTRGP(JFLD+IOFF) = IOFFSCEW+ISHFSC_G(ISTSC_G+JFLD-1) - ENDIF - ENDDO - DO JFLD=1,IF_UV - IPTRSPUV(JFLD) = ISTUV+JFLD-1 - ENDDO - DO JFLD=1,IF_SCALARS - IPTRSPSC(JFLD) = ISTSC+JFLD-1 - ENDDO - - CALL LTINV_CTL(IF_UV,IF_SCALARS, & - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC,FOUBUF=FOUBUF,KFIELD=KFIELD) - - IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_GP,& - & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& - & PGP=PGP) - ELSEIF(IF_UV_G > 0) THEN - CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_GP,& - & FOUBUF,KVSETUV=IVSETUV(ISTUV_G:IENUV_G),KPTRGP=IPTRGP,& - & PGP=PGP) - ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTINV_CTL(KFIELD,IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_GP,& - & FOUBUF,KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& - & PGP=PGP) - ENDIF - ENDDO - -ELSE ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF - CALL LTINV_CTL(KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,KFIELD) + + CALL GSTATS(102,0) + CALL LTINV(KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & FOUBUF_IN,KFIELD) + CALL GSTATS(102,1) + + CALL GSTATS(152,0) + WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' + CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,KFIELD) + CALL GSTATS(152,1) ! from FOUBUF to PGPXXX CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& @@ -293,8 +162,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) -ENDIF - ! ------------------------------------------------------------------ END SUBROUTINE INV_TRANS_CTL diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index cd7ca1567..a612a2d59 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -14,6 +14,9 @@ MODULE LEDIR_MOD USE ALLOCATOR_MOD IMPLICIT NONE + PRIVATE + PUBLIC :: LEDIR_HANDLE, LEDIR_STRIDES, LEDIR + TYPE LEDIR_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA END TYPE diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 index ec643fb1a..27f9752d5 100755 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ b/src/trans/gpu/internal/ltinv_ctl_mod.F90 @@ -76,25 +76,6 @@ SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& INTEGER(KIND=JPIM),INTENT(OUT) :: KFIELD REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) - CALL GSTATS(102,0) - CALL GSTATS(1647,0) - ! LTINV allocates FOUBUF_IN and creates on device - CALL LTINV(KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & KFLDPTRUV,KFLDPTRSC,FOUBUF_IN,KFIELD) - CALL GSTATS(1647,1) - CALL GSTATS(102,1) - - CALL GSTATS(152,0) - ! TRMTOL deallocates FOUBUF_IN and deletes on device - ! from FOUBUF_IN to FOUBUF -#ifdef USE_CUDA_AWARE_MPI_FT - WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,KFIELD) -#else - CALL TRMTOL(FOUBUF_IN,FOUBUF,KFIELD) -#endif - CALL GSTATS(152,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index ec800cb98..42b6a0923 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -12,7 +12,7 @@ MODULE LTINV_MOD CONTAINS SUBROUTINE LTINV(KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & KFLDPTRUV,KFLDPTRSC,FOUBUF_IN,FOUBUF_KFIELD) + & FOUBUF_IN,FOUBUF_KFIELD) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK @@ -95,8 +95,6 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM), INTENT(OUT) :: FOUBUF_KFIELD @@ -182,8 +180,8 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& ENDIF IF (KF_UV > 0) THEN - CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2),KFLDPTRUV) - CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2),KFLDPTRUV) + CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2)) + CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2)) ! Compute U and V for VOR and DIV CALL VDTUV(KF_UV,ZEPSNM,PVOR,PDIV,PU,PV) @@ -191,7 +189,7 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& IF (KF_SCALARS > 0) THEN IF(PRESENT(PSPSCALAR)) THEN - CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2),KFLDPTRSC) + CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2)) ELSE IFIRST = 1 IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN From 6d4ccc22fab6e03d60532567d32d38264f65b16f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:36 -0700 Subject: [PATCH 230/263] merge ftinv_ctl into inv_trans temporarily --- src/trans/gpu/internal/ftinv_ctl_mod.F90 | 282 ------------------- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 188 ++++++++++++- src/trans/gpu/internal/ltinv_ctl_mod.F90 | 83 ------ 3 files changed, 177 insertions(+), 376 deletions(-) delete mode 100755 src/trans/gpu/internal/ftinv_ctl_mod.F90 delete mode 100755 src/trans/gpu/internal/ltinv_ctl_mod.F90 diff --git a/src/trans/gpu/internal/ftinv_ctl_mod.F90 b/src/trans/gpu/internal/ftinv_ctl_mod.F90 deleted file mode 100755 index cd96670b3..000000000 --- a/src/trans/gpu/internal/ftinv_ctl_mod.F90 +++ /dev/null @@ -1,282 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTINV_CTL_MOD -CONTAINS -SUBROUTINE FTINV_CTL(KF_INPUT,KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_GP,FOUBUF, & - & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP) - - -!**** *FTINV_CTL - Inverse Fourier transform control - -! Purpose. Control routine for Fourier to Gridpoint transform -! -------- - -!** Interface. -! ---------- -! CALL FTINV_CTL(..) - -! Explicit arguments : -! -------------------- -! PGP - gridpoint array -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KVSETUV - "B" set in spectral/fourier space for -! u and v variables -! KVSETSC - "B" set in spectral/fourier space for -! scalar variables -! KPTRGP - pointer array to fi3elds in gridpoint space - -! Method. -! ------- - -! Externals. TRLTOG - transposition routine -! ---------- FOURIER_IN - copy fourier data from Fourier buffer -! FTINV - fourier transform -! FSC - Fourier space computations - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_GEN ,ONLY : NERR, nout -!USE TPM_DIM -!USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP,LATLON, REUSE_PTR -USE TPM_DISTR ,ONLY : D, MYPROC, NPROC -USE TPM_FLT ,ONLY : S -USE FOURIER_IN_MOD ,ONLY : FOURIER_IN -USE FSC_MOD ,ONLY : FSC -USE FTINV_MOD ,ONLY : FTINV -USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -use ieee_arithmetic -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_INPUT -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) - -REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) -INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & - & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET - -INTEGER(KIND=JPIM) :: IST -INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) -INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) -INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR -INTEGER(KIND=JPIM) :: IFIRST -INTEGER(KIND=JPIM) :: KF_FS - - -! ------------------------------------------------------------------ - -! 1. Copy Fourier data to local array - -CALL GSTATS(107,0) -CALL GSTATS(1639,0) - -! Compute PREEL_COMPLEX Domain decomposition -IFIRST = 0 -IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity -IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -IFIRST = IFIRST + 2*KF_UV ! U and V -IFIRST = IFIRST + KF_SCALARS ! Scalars -IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives -! This verifies if we get the same assumptions about how much data we get from the LT space -IF (2*IFIRST /= KF_INPUT) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') -IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives -IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives -KF_FS = IFIRST - -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF -PREEL_COMPLEX => REUSE_PTR - -! Initialize potentially unset offsets -KSCALARS_NSDER_OFFSET = -1 -KUV_EWDER_OFFSET = -1 -KSCALARS_EWDER_OFFSET = -1 - -! And reiterate domain decomposition to assign pointers -IFIRST = 0 -IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity -IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -KUV_OFFSET = IFIRST -IFIRST = IFIRST + 2*KF_UV ! U and V -KSCALARS_OFFSET = IFIRST -IFIRST = IFIRST + KF_SCALARS ! Scalars -IF (LSCDERS) THEN - KSCALARS_NSDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives -ENDIF -IF (LUVDER) THEN - KUV_EWDER_OFFSET = IFIRST - IFIRST = IFIRST+2*KF_UV ! U and V derivatives -ENDIF -IF (LSCDERS) THEN - KSCALARS_EWDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives -ENDIF - -! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now -CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_INPUT/2,KF_FS) - -! 2. Fourier space computations - -! fill the rest of PREEL_COMPLEX -CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & - & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) - -! 3. Fourier transform -! inplace operation -IF(KF_FS > 0) THEN - CALL FTINV(PREEL_COMPLEX,PREEL_REAL,KF_FS) -ELSE - PREEL_REAL => PREEL_COMPLEX -ENDIF - -CALL GSTATS(1639,1) -CALL GSTATS(107,1) - -! 4. Transposition - -IF (PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) -ELSE - IVSETUV(:) = -1 -ENDIF -IVSETSC(:)=-1 -IF (PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) -ELSE - IOFF=0 - IF (PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 - ENDIF - IF (PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - IGP3APAR=UBOUND(PGP3A,3) - IF (LSCDERS) IGP3APAR=IGP3APAR/3 - DO J3=1,IGP3APAR - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A - ENDDO - ENDIF - IF (PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - IGP3BPAR=UBOUND(PGP3B,3) - IF (LSCDERS) IGP3BPAR=IGP3BPAR/3 - DO J3=1,IGP3BPAR - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B - ENDDO - ENDIF - IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN - WRITE(NERR,*)'FTINV:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G - CALL ABORT_TRANS('FTINV_CTL_MOD:IOFF /= KF_SCALARS_G') - ENDIF -ENDIF - -IST = 1 -IF (KF_UV_G > 0) THEN - IF (LVORGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IF ( LDIVGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF (KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - IF (LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF -ENDIF -IF (KF_UV_G > 0 .AND. LUVDER) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF (KF_SCALARS_G > 0) THEN - IF (LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF -ENDIF - -CALL GSTATS(157,0) -#ifdef USE_CUDA_AWARE_MPI_FT -WRITE(NOUT,*) 'ftinv_ctl:TRLTOG_CUDAAWARE' -CALL TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) -#else -!WRITE(NOUT,*) 'ftinv_ctl:TRLTOG' -CALL TRLTOG(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) -#endif -CALL GSTATS(157,1) -! ------------------------------------------------------------------ - -END SUBROUTINE FTINV_CTL -END MODULE FTINV_CTL_MOD diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index a8b4254da..fc2498975 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -84,16 +84,19 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT -USE TPM_GEN ,ONLY : NPROMATR, NOUT -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP +USE TPM_GEN ,ONLY : NPROMATR, NOUT, NERR +USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, REUSE_PTR +USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE SHUFFLE_MOD ,ONLY : SHUFFLE -USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE FTINV_CTL_MOD ,ONLY : FTINV_CTL USE TRMTOL_MOD USE LTINV_MOD +USE FOURIER_IN_MOD ,ONLY : FOURIER_IN +USE FSC_MOD ,ONLY : FSC +USE FTINV_MOD ,ONLY : FTINV +USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE +USE TPM_DISTR ,ONLY : D ! IMPLICIT NONE @@ -134,6 +137,18 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF(:) REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) +REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & + & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET + +INTEGER(KIND=JPIM) :: IST +INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) +INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) +INTEGER(KIND=JPIM) :: IVSET(KF_GP) +INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR +INTEGER(KIND=JPIM) :: IFIRST + ! ------------------------------------------------------------------ IF(NPROMATR > 0) THEN @@ -156,12 +171,163 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL GSTATS(152,1) ! from FOUBUF to PGPXXX - CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_GP,& - & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + ! CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& + ! & KF_UV,KF_SCALARS,KF_GP,& + ! & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + ! & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + ! & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + +CALL GSTATS(107,0) +CALL GSTATS(1639,0) + +! Compute PREEL_COMPLEX Domain decomposition +IFIRST = 0 +IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity +IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence +IFIRST = IFIRST + 2*KF_UV ! U and V +IFIRST = IFIRST + KF_SCALARS ! Scalars +IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives +! This verifies if we get the same assumptions about how much data we get from the LT space +IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') +IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives +IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives +IF (IFIRST /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') + +IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) +ENDIF +PREEL_COMPLEX => REUSE_PTR + +! Initialize potentially unset offsets +KSCALARS_NSDER_OFFSET = -1 +KUV_EWDER_OFFSET = -1 +KSCALARS_EWDER_OFFSET = -1 + +! And reiterate domain decomposition to assign pointers +IFIRST = 0 +IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity +IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence +KUV_OFFSET = IFIRST +IFIRST = IFIRST + 2*KF_UV ! U and V +KSCALARS_OFFSET = IFIRST +IFIRST = IFIRST + KF_SCALARS ! Scalars +IF (LSCDERS) THEN + KSCALARS_NSDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives +ENDIF +IF (LUVDER) THEN + KUV_EWDER_OFFSET = IFIRST + IFIRST = IFIRST+2*KF_UV ! U and V derivatives +ENDIF +IF (LSCDERS) THEN + KSCALARS_EWDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives +ENDIF + +! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now +CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) + +! 2. Fourier space computations + +! fill the rest of PREEL_COMPLEX +CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + +! 3. Fourier transform +! inplace operation +IF (KF_FS > 0) CALL FTINV(PREEL_COMPLEX,PREEL_REAL,KF_FS) + +CALL GSTATS(1639,1) +CALL GSTATS(107,1) +! 4. Transposition + +IF (PRESENT(KVSETUV)) THEN + IVSETUV(:) = KVSETUV(:) +ELSE + IVSETUV(:) = -1 +ENDIF +IVSETSC(:)=-1 +IF (PRESENT(KVSETSC)) THEN + IVSETSC(:) = KVSETSC(:) +ELSE + IOFF=0 + IF (PRESENT(KVSETSC2)) THEN + IFGP2=UBOUND(KVSETSC2,1) + IVSETSC(1:IFGP2)=KVSETSC2(:) + IOFF=IOFF+IFGP2 + ENDIF + IF (PRESENT(KVSETSC3A)) THEN + IFGP3A=UBOUND(KVSETSC3A,1) + IGP3APAR=UBOUND(PGP3A,3) + IF (LSCDERS) IGP3APAR=IGP3APAR/3 + DO J3=1,IGP3APAR + IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) + IOFF=IOFF+IFGP3A + ENDDO + ENDIF + IF (PRESENT(KVSETSC3B)) THEN + IFGP3B=UBOUND(KVSETSC3B,1) + IGP3BPAR=UBOUND(PGP3B,3) + IF (LSCDERS) IGP3BPAR=IGP3BPAR/3 + DO J3=1,IGP3BPAR + IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) + IOFF=IOFF+IFGP3B + ENDDO + ENDIF + IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN + WRITE(NERR,*)'FTINV:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G + CALL ABORT_TRANS('FTINV_CTL_MOD:IOFF /= KF_SCALARS_G') + ENDIF +ENDIF + +IST = 1 +IF (KF_UV_G > 0) THEN + IF (LVORGP) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + ENDIF + IF ( LDIVGP) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + ENDIF + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G +ENDIF +IF (KF_SCALARS_G > 0) THEN + IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) + IST = IST+KF_SCALARS_G + IF (LSCDERS) THEN + IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) + IST = IST+KF_SCALARS_G + ENDIF +ENDIF +IF (KF_UV_G > 0 .AND. LUVDER) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G +ENDIF +IF (KF_SCALARS_G > 0) THEN + IF (LSCDERS) THEN + IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) + IST = IST+KF_SCALARS_G + ENDIF +ENDIF + +CALL GSTATS(157,0) +CALL TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& + &PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) +CALL GSTATS(157,1) +! ------------------------------------------------------------------ ! ------------------------------------------------------------------ END SUBROUTINE INV_TRANS_CTL diff --git a/src/trans/gpu/internal/ltinv_ctl_mod.F90 b/src/trans/gpu/internal/ltinv_ctl_mod.F90 deleted file mode 100755 index 27f9752d5..000000000 --- a/src/trans/gpu/internal/ltinv_ctl_mod.F90 +++ /dev/null @@ -1,83 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTINV_CTL_MOD - CONTAINS - SUBROUTINE LTINV_CTL(KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,FOUBUF,KFIELD,& - & KFLDPTRUV,KFLDPTRSC) - - !**** *LTINV_CTL* - Control routine for inverse Legandre transform. - - ! Purpose. - ! -------- - ! Control routine for the inverse LEGENDRE transform - - !** Interface. - ! ---------- - ! CALL INV_TRANS_CTL(...) - ! KF_UV - local number of spectral u-v fields - ! KF_SCALARS - local number of scalar spectral fields - ! PSPVOR(:,:) - spectral vorticity (input) - ! PSPDIV(:,:) - spectral divergence (input) - ! PSPSCALAR(:,:) - spectral scalarvalued fields (input) - ! KFLDPTRUV(:) - field pointer array for vor./div. - ! KFLDPTRSC(:) - field pointer array for PSPSCALAR - - ! Method. - ! ------- - - ! Externals. - ! ---------- - ! - - ! Author. - ! ------- - ! Mats Hamrud *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 00-06-03 - - ! ------------------------------------------------------------------ - - USE PARKIND1 ,ONLY : JPIM ,JPRB - - USE TPM_GEN, only: nout - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D - USE TPM_GEOMETRY ,ONLY : G - - USE TPM_FLT - - USE LTINV_MOD ,ONLY : LTINV - USE TRMTOL_MOD ,ONLY : TRMTOL, TRMTOL_CUDAAWARE - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - IMPLICIT NONE - - INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV,KF_SCALARS - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) - REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF(:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - INTEGER(KIND=JPIM),INTENT(OUT) :: KFIELD - REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) - - - ! ------------------------------------------------------------------ - - END SUBROUTINE LTINV_CTL - END MODULE LTINV_CTL_MOD From 1a779dc6b90d0066ec77148cf88f1a279c2408a2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:36 -0700 Subject: [PATCH 231/263] Inverse transform: Add empty handles --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 9 +- src/trans/gpu/internal/fourier_in_mod.F90 | 24 +- src/trans/gpu/internal/fsc_mod.F90 | 22 +- src/trans/gpu/internal/ftdir_ctlad_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 21 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 306 +++++---- src/trans/gpu/internal/ledir_mod.F90 | 7 +- src/trans/gpu/internal/ltdir_ctlad_mod.F90 | 5 +- src/trans/gpu/internal/ltdir_mod.F90 | 26 +- src/trans/gpu/internal/ltinv_mod.F90 | 30 +- src/trans/gpu/internal/trltog_mod.F90 | 686 +------------------ src/trans/gpu/internal/trltom_mod.F90 | 3 + src/trans/gpu/internal/trmtol_mod.F90 | 135 +--- 13 files changed, 325 insertions(+), 951 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 7e30d283f..ce2272cb9 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -82,8 +82,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE SHUFFLE_MOD ,ONLY : SHUFFLE USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR -USE LEDIR_MOD, ONLY: LEDIR_HANDLE +USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR, LTDIR_HANDLE USE TRGTOL_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_GEN ,ONLY : LSYNC_TRANS @@ -148,7 +147,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(TRLTOM_HANDLE) :: HTRLTOM TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT -TYPE(LEDIR_HANDLE) :: HLEDIR +TYPE(LTDIR_HANDLE) :: HLTDIR INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) @@ -168,7 +167,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ELSE HTRLTOM_DIRECT = PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) ENDIF - HLEDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) + HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) ! TODO this is going to be simplified when we have it implemented for invtrans too IF(ALLOCATED(REUSE_PTR)) & @@ -210,7 +209,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL GSTATS(153,1) IF (KF_FS > 0) THEN - CALL LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & + CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2) ENDIF diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index 8f5230b4b..aa94e6da4 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -9,8 +9,28 @@ ! MODULE FOURIER_IN_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FOURIER_IN, FOURIER_IN_HANDLE, PREPARE_FOURIER_IN + + TYPE FOURIER_IN_HANDLE + END TYPE + CONTAINS -SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) + FUNCTION PREPARE_FOURIER_IN(ALLOCATOR) RESULT(HFOURIER_IN) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + + TYPE(FOURIER_IN_HANDLE) :: HFOURIER_IN + + END FUNCTION +SUBROUTINE FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) !**** *FOURIER_IN* - Copy fourier data from buffer to local array @@ -49,6 +69,8 @@ SUBROUTINE FOURIER_IN(FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(FOURIER_IN_HANDLE), INTENT(IN) :: HFOURIER_IN INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL INTEGER(KIND=JPIM) :: IBEG,IEND,IINC diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 7755e81a2..3047a8b3a 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -9,8 +9,26 @@ ! MODULE FSC_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FSC, PREPARE_FSC, FSC_HANDLE + + TYPE FSC_HANDLE + END TYPE + CONTAINS -SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & + FUNCTION PREPARE_FSC(ALLOCATOR) RESULT(HFSC) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + TYPE(FSC_HANDLE) :: HFSC + END FUNCTION +SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & & KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) !**** *FSC - Division by a*cos(theta), east-west derivatives @@ -71,6 +89,8 @@ SUBROUTINE FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, & REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV, KF_SCALARS INTEGER(KIND=JPIM), INTENT(IN) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(FSC_HANDLE), INTENT(IN) :: HFSC ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 b/src/trans/gpu/internal/ftdir_ctlad_mod.F90 index aa14925ed..c26f71734 100755 --- a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctlad_mod.F90 @@ -62,7 +62,7 @@ SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & !USE TPM_GEOMETRY USE TPM_DISTR ,ONLY : D, MYPROC, NPROC -USE TRLTOG_MOD ,ONLY : TRLTOG +!USE TRLTOG_MOD ,ONLY : TRLTOG USE FOURIER_OUTAD_MOD ,ONLY : FOURIER_OUTAD USE FTDIRAD_MOD ,ONLY : FTDIRAD ! diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index ff3071f45..f889055df 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -9,8 +9,25 @@ ! MODULE FTINV_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: FTINV, FTINV_HANDLE, PREPARE_FTINV + + TYPE FTINV_HANDLE + END TYPE CONTAINS -SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) + FUNCTION PREPARE_FTINV(ALLOCATOR) RESULT(HFTINV) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + TYPE(FTINV_HANDLE) :: HFTINV + END FUNCTION +SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) !**** *FTINV - Inverse Fourier transform @@ -58,6 +75,8 @@ SUBROUTINE FTINV(PREEL_COMPLEX,PREEL_REAL,KFIELD) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(FTINV_HANDLE), INTENT(IN) :: HFTINV INTEGER(KIND=JPIM) :: IGLG,KGL,IRET diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index fc2498975..f74707833 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -92,11 +92,12 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TRMTOL_MOD USE LTINV_MOD -USE FOURIER_IN_MOD ,ONLY : FOURIER_IN -USE FSC_MOD ,ONLY : FSC -USE FTINV_MOD ,ONLY : FTINV -USE TRLTOG_MOD ,ONLY : TRLTOG, TRLTOG_CUDAAWARE +USE FOURIER_IN_MOD +USE FSC_MOD +USE FTINV_MOD +USE TRLTOG_MOD USE TPM_DISTR ,ONLY : D +USE ALLOCATOR_MOD ! IMPLICIT NONE @@ -149,6 +150,14 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR INTEGER(KIND=JPIM) :: IFIRST +TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR +TYPE(LTINV_HANDLE) :: HLTINV +TYPE(TRMTOL_HANDLE) :: HTRMTOL +TYPE(FOURIER_IN_HANDLE) :: HFOURIER_IN +TYPE(FSC_HANDLE) :: HFSC +TYPE(FTINV_HANDLE) :: HFTINV +TYPE(TRLTOG_HANDLE) :: HTRLTOG + ! ------------------------------------------------------------------ IF(NPROMATR > 0) THEN @@ -156,178 +165,177 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& stop 24 ENDIF + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HLTINV = PREPARE_LTINV(ALLOCATOR,KF_GP,KF_FS) + HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,KF_FS) + HFOURIER_IN = PREPARE_FOURIER_IN(ALLOCATOR) + HFSC = PREPARE_FSC(ALLOCATOR) + HFTINV = PREPARE_FTINV(ALLOCATOR) + HTRLTOG = PREPARE_TRLTOG(ALLOCATOR) + ! No splitting of fields, transform done in one go ! from PSPXXX to FOUBUF CALL GSTATS(102,0) - CALL LTINV(KF_UV,KF_SCALARS,& + CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & FOUBUF_IN,KFIELD) CALL GSTATS(102,1) CALL GSTATS(152,0) WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - CALL TRMTOL_CUDAAWARE(FOUBUF_IN,FOUBUF,KFIELD) + CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,KFIELD) CALL GSTATS(152,1) - ! from FOUBUF to PGPXXX - ! CALL FTINV_CTL(KFIELD,KF_UV_G,KF_SCALARS_G,& - ! & KF_UV,KF_SCALARS,KF_GP,& - ! & FOUBUF,KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - ! & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - ! & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - -CALL GSTATS(107,0) -CALL GSTATS(1639,0) - -! Compute PREEL_COMPLEX Domain decomposition -IFIRST = 0 -IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity -IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -IFIRST = IFIRST + 2*KF_UV ! U and V -IFIRST = IFIRST + KF_SCALARS ! Scalars -IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives -! This verifies if we get the same assumptions about how much data we get from the LT space -IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') -IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives -IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives -IF (IFIRST /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') - -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF -PREEL_COMPLEX => REUSE_PTR - -! Initialize potentially unset offsets -KSCALARS_NSDER_OFFSET = -1 -KUV_EWDER_OFFSET = -1 -KSCALARS_EWDER_OFFSET = -1 - -! And reiterate domain decomposition to assign pointers -IFIRST = 0 -IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity -IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence -KUV_OFFSET = IFIRST -IFIRST = IFIRST + 2*KF_UV ! U and V -KSCALARS_OFFSET = IFIRST -IFIRST = IFIRST + KF_SCALARS ! Scalars -IF (LSCDERS) THEN - KSCALARS_NSDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives -ENDIF -IF (LUVDER) THEN - KUV_EWDER_OFFSET = IFIRST - IFIRST = IFIRST+2*KF_UV ! U and V derivatives -ENDIF -IF (LSCDERS) THEN - KSCALARS_EWDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives -ENDIF - -! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now -CALL FOURIER_IN(FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) - -! 2. Fourier space computations - -! fill the rest of PREEL_COMPLEX -CALL FSC(PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & - & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) - -! 3. Fourier transform -! inplace operation -IF (KF_FS > 0) CALL FTINV(PREEL_COMPLEX,PREEL_REAL,KF_FS) - -CALL GSTATS(1639,1) -CALL GSTATS(107,1) - -! 4. Transposition - -IF (PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) -ELSE - IVSETUV(:) = -1 -ENDIF -IVSETSC(:)=-1 -IF (PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) -ELSE - IOFF=0 - IF (PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 + CALL GSTATS(107,0) + + ! Compute PREEL_COMPLEX Domain decomposition + IFIRST = 0 + IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity + IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence + IFIRST = IFIRST + 2*KF_UV ! U and V + IFIRST = IFIRST + KF_SCALARS ! Scalars + IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives + ! This verifies if we get the same assumptions about how much data we get from the LT space + IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') + IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives + IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives + IF (IFIRST /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') + + IF (.NOT. ALLOCATED(REUSE_PTR)) THEN + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN + !$ACC EXIT DATA DELETE(REUSE_PTR) + DEALLOCATE(REUSE_PTR) + ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) + !$ACC ENTER DATA CREATE(REUSE_PTR) + ENDIF + PREEL_COMPLEX => REUSE_PTR + + ! Initialize potentially unset offsets + KSCALARS_NSDER_OFFSET = -1 + KUV_EWDER_OFFSET = -1 + KSCALARS_EWDER_OFFSET = -1 + + ! And reiterate domain decomposition to assign pointers + IFIRST = 0 + IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity + IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence + KUV_OFFSET = IFIRST + IFIRST = IFIRST + 2*KF_UV ! U and V + KSCALARS_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars + IF (LSCDERS) THEN + KSCALARS_NSDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives + ENDIF + IF (LUVDER) THEN + KUV_EWDER_OFFSET = IFIRST + IFIRST = IFIRST+2*KF_UV ! U and V derivatives ENDIF - IF (PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - IGP3APAR=UBOUND(PGP3A,3) - IF (LSCDERS) IGP3APAR=IGP3APAR/3 - DO J3=1,IGP3APAR - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A - ENDDO + IF (LSCDERS) THEN + KSCALARS_EWDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF - IF (PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - IGP3BPAR=UBOUND(PGP3B,3) - IF (LSCDERS) IGP3BPAR=IGP3BPAR/3 - DO J3=1,IGP3BPAR - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B - ENDDO + + ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now + CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) + + ! 2. Fourier space computations + + ! fill the rest of PREEL_COMPLEX + CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + + ! 3. Fourier transform + ! inplace operation + IF (KF_FS > 0) CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,KF_FS) + + CALL GSTATS(107,1) + + ! 4. Transposition + + IF (PRESENT(KVSETUV)) THEN + IVSETUV(:) = KVSETUV(:) + ELSE + IVSETUV(:) = -1 ENDIF - IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN - WRITE(NERR,*)'FTINV:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G - CALL ABORT_TRANS('FTINV_CTL_MOD:IOFF /= KF_SCALARS_G') + IVSETSC(:)=-1 + IF (PRESENT(KVSETSC)) THEN + IVSETSC(:) = KVSETSC(:) + ELSE + IOFF=0 + IF (PRESENT(KVSETSC2)) THEN + IFGP2=UBOUND(KVSETSC2,1) + IVSETSC(1:IFGP2)=KVSETSC2(:) + IOFF=IOFF+IFGP2 + ENDIF + IF (PRESENT(KVSETSC3A)) THEN + IFGP3A=UBOUND(KVSETSC3A,1) + IGP3APAR=UBOUND(PGP3A,3) + IF (LSCDERS) IGP3APAR=IGP3APAR/3 + DO J3=1,IGP3APAR + IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) + IOFF=IOFF+IFGP3A + ENDDO + ENDIF + IF (PRESENT(KVSETSC3B)) THEN + IFGP3B=UBOUND(KVSETSC3B,1) + IGP3BPAR=UBOUND(PGP3B,3) + IF (LSCDERS) IGP3BPAR=IGP3BPAR/3 + DO J3=1,IGP3BPAR + IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) + IOFF=IOFF+IFGP3B + ENDDO + ENDIF + IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN + WRITE(NERR,*)'FTINV:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G + CALL ABORT_TRANS('FTINV_CTL_MOD:IOFF /= KF_SCALARS_G') + ENDIF ENDIF -ENDIF -IST = 1 -IF (KF_UV_G > 0) THEN - IF (LVORGP) THEN + IST = 1 + IF (KF_UV_G > 0) THEN + IF (LVORGP) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + ENDIF + IF ( LDIVGP) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + ENDIF IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) IST = IST+KF_UV_G - ENDIF - IF ( LDIVGP) THEN IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) IST = IST+KF_UV_G ENDIF - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF (KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - IF (LSCDERS) THEN + IF (KF_SCALARS_G > 0) THEN IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) IST = IST+KF_SCALARS_G + IF (LSCDERS) THEN + IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) + IST = IST+KF_SCALARS_G + ENDIF ENDIF -ENDIF -IF (KF_UV_G > 0 .AND. LUVDER) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF (KF_SCALARS_G > 0) THEN - IF (LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G + IF (KF_UV_G > 0 .AND. LUVDER) THEN + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) + IST = IST+KF_UV_G + ENDIF + IF (KF_SCALARS_G > 0) THEN + IF (LSCDERS) THEN + IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) + IST = IST+KF_SCALARS_G + ENDIF ENDIF -ENDIF -CALL GSTATS(157,0) -CALL TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& - &PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) -CALL GSTATS(157,1) -! ------------------------------------------------------------------ + CALL GSTATS(157,0) + CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& + &PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + CALL GSTATS(157,1) + ! ------------------------------------------------------------------ ! ------------------------------------------------------------------ END SUBROUTINE INV_TRANS_CTL diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index a612a2d59..ece8184df 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -11,15 +11,10 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM - USE ALLOCATOR_MOD IMPLICIT NONE PRIVATE - PUBLIC :: LEDIR_HANDLE, LEDIR_STRIDES, LEDIR - - TYPE LEDIR_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA - END TYPE + PUBLIC :: LEDIR_STRIDES, LEDIR INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS diff --git a/src/trans/gpu/internal/ltdir_ctlad_mod.F90 b/src/trans/gpu/internal/ltdir_ctlad_mod.F90 index dcf8a5f54..1bf9cbf50 100755 --- a/src/trans/gpu/internal/ltdir_ctlad_mod.F90 +++ b/src/trans/gpu/internal/ltdir_ctlad_mod.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -40,7 +41,7 @@ SUBROUTINE LTDIR_CTLAD(KF_FS,KF_UV,KF_SCALARS, & USE TPM_DISTR ,ONLY : D USE LTDIRAD_MOD ,ONLY : LTDIRAD -USE TRMTOL_MOD ,ONLY : TRMTOL +!USE TRMTOL_MOD ,ONLY : TRMTOL ! IMPLICIT NONE @@ -100,7 +101,7 @@ SUBROUTINE LTDIR_CTLAD(KF_FS,KF_UV,KF_SCALARS, & CALL GSTATS(105,1) CALL GSTATS(181,0) -CALL TRMTOL(FOUBUF,FOUBUF_IN,2*KF_FS) +!CALL TRMTOL(FOUBUF,FOUBUF_IN,2*KF_FS) CALL GSTATS(181,1) IF (.NOT.LALLOPERM) DEALLOCATE(FOUBUF) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 47b89aedc..e0d024be6 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -10,10 +10,18 @@ ! MODULE LTDIR_MOD + USE ALLOCATOR_MOD IMPLICIT NONE + PRIVATE + PUBLIC :: PREPARE_LTDIR, LTDIR_HANDLE, LTDIR + + TYPE LTDIR_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUT_AND_POA + END TYPE + CONTAINS - FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLEDIR) + FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD USE TPM_DISTR, ONLY: D USE TPM_DIM, ONLY: R @@ -25,7 +33,7 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLEDIR) TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV - TYPE(LEDIR_HANDLE) :: HLEDIR + TYPE(LTDIR_HANDLE) :: HLTDIR INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 @@ -46,10 +54,10 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLEDIR) ! ZOUT0 IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - HLEDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) END FUNCTION - SUBROUTINE LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& + SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) @@ -157,7 +165,7 @@ SUBROUTINE LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA REAL(KIND=JPRBT), POINTER :: ZOUT(:) REAL(KIND=JPRD), POINTER :: ZOUT0(:) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(LEDIR_HANDLE), INTENT(IN) :: HLEDIR + TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 @@ -183,26 +191,26 @@ SUBROUTINE LTDIR(ALLOCATOR,HLEDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IALLOC_POS = 1 IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) - CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) - CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUT(1)),128) - CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT0 IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUT0(1)),128) - CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLEDIR%HOUT_AND_POA),& + CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 42b6a0923..68fda910c 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -9,8 +9,34 @@ ! MODULE LTINV_MOD + USE ALLOCATOR_MOD + + IMPLICIT NONE + + PRIVATE + PUBLIC :: LTINV, LTINV_HANDLE, PREPARE_LTINV + + TYPE LTINV_HANDLE + END TYPE + CONTAINS - SUBROUTINE LTINV(KF_UV,KF_SCALARS,& + FUNCTION PREPARE_LTINV(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTINV) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE ISO_C_BINDING + USE LEDIR_MOD + USE ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + + TYPE(LTINV_HANDLE) :: HLTINV + END FUNCTION + + SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & FOUBUF_IN,FOUBUF_KFIELD) @@ -105,6 +131,8 @@ SUBROUTINE LTINV(KF_UV,KF_SCALARS,& REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 236bf237e..e9d900339 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -9,8 +9,26 @@ ! MODULE TRLTOG_MOD - CONTAINS - SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRLTOG_CUDAAWARE, TRLTOG_HANDLE, PREPARE_TRLTOG + + TYPE TRLTOG_HANDLE + END TYPE + +CONTAINS + FUNCTION PREPARE_TRLTOG(ALLOCATOR) RESULT(HTRLTOG) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + TYPE(TRLTOG_HANDLE) :: HTRLTOG + END FUNCTION + SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) !**** *trltog * - transposition of grid point data from latitudinal @@ -99,6 +117,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOG_HANDLE) :: HTRLTOG ! LOCAL VARIABLES @@ -615,667 +635,5 @@ SUBROUTINE TRLTOG_CUDAAWARE(PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KP IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) END SUBROUTINE TRLTOG_CUDAAWARE - - SUBROUTINE TRLTOG(PREEL_REAL,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) - - !**** *trltog * - transposition of grid point data from latitudinal - ! to column structure. This takes place between inverse - ! FFT and grid point calculations. - ! TRLTOG is the inverse of TRGTOL - - ! Purpose. - ! -------- - - - !** Interface. - ! ---------- - ! *call* *trltog(...) - - ! Explicit arguments : - ! -------------------- - ! PREEL_REAL - Latitudinal data ready for direct FFT (input) - ! PGP - Blocked grid point data (output) - ! KVSET - "v-set" for each field (input) - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! D.Dent : 97-08-04 Reorganisation to allow NPRTRV - ! to differ from NPRGPEW - ! =99-03-29= Mats Hamrud and Deborah Salmond - ! JUMP in FFT's changed to 1 - ! INDEX introduced and ZCOMBUF not used for same PE - ! 01-11-23 Deborah Salmond and John Hague - ! LIMP_NOOLAP Option for non-overlapping message passing - ! and buffer packing - ! 01-12-18 Peter Towers - ! Improved vector performance of LTOG_PACK,LTOG_UNPACK - ! 03-0-02 G. Radnoti: Call barrier always when nproc>1 - ! 08-01-01 G.Mozdzynski: cleanup - ! 09-01-02 G.Mozdzynski: use non-blocking recv and send - ! ------------------------------------------------------------------ - - - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, MPL_WAIT, JP_NON_BLOCKING_STANDARD, MPL_MYRANK - - USE TPM_GEN ,ONLY : NOUT - USE TPM_DISTR ,ONLY : D, MYSETV, MYSETW, MTAGLG, & - & NPRCIDS, NPRTRNS, MYPROC, NPROC - USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NGPBLKS - - USE INIGPTR_MOD ,ONLY : INIGPTR - USE PE2SET_MOD ,ONLY : PE2SET - !USE MYSENDSET_MOD - !USE MYRECVSET_MOD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - ! - USE MPI - - IMPLICIT NONE - - - REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP - INTEGER(KIND=JPIM),INTENT(IN) :: KF_SCALARS_G - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP(:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGPUV(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) - - ! LOCAL VARIABLES - - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:,:),ZCOMBUFR(:,:) - REAL(KIND=JPRBT) :: ZDUM(2) - - INTEGER(KIND=JPIM) :: ISENT (NPROC) - INTEGER(KIND=JPIM) :: IRCVD (NPROC) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*2) - INTEGER(KIND=JPIM) :: JSEND (NPROC) - INTEGER(KIND=JPIM) :: JRECV (NPROC) - - INTEGER(KIND=JPIM) :: IFIRST, IFIRSTLAT, IFLD, IGL, IGLL,& - &ILAST, ILASTLAT, IPOS, ISETA, & - &ISETB, IRECV, IRECVSET, & - &ISETV, ISEND, ITAG, JBLK, JFLD, & - &JGL, JK, JL, JLOOP, ISETW, IFLDS, IPROC,JROC, & - &INRECV, INSEND,INR,INS,IR - INTEGER(KIND=JPIM) :: II,INDOFFX,ILEN,IBUFLENS,IBUFLENR - - LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY - LOGICAL :: LLUV(KF_GP),LLGP2(KF_GP),LLGP3A(KF_GP),LLGP3B(KF_GP) - LOGICAL :: LLDONE, LLINDER - INTEGER(KIND=JPIM) :: IUVLEVS(KF_GP),IUVPARS(KF_GP),IGP2PARS(KF_GP) - INTEGER(KIND=JPIM) :: IGP3APARS(KF_GP),IGP3ALEVS(KF_GP),IGP3BPARS(KF_GP),IGP3BLEVS(KF_GP) - INTEGER(KIND=JPIM) :: IUVPAR,IUVLEV,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF - INTEGER(KIND=JPIM) :: IOFF,IOFF1,IOFFNS,IOFFEW,J1,J2 - INTEGER(KIND=JPIM) :: INDEX(D%NLENGTF),INDOFF(NPROC),IFLDOFF(KF_GP) - INTEGER(KIND=JPIM) :: IRECV_FLD_START,IRECV_FLD_END - INTEGER(KIND=JPIM) :: ISEND_FLD_START(NPROC),ISEND_FLD_END - INTEGER(KIND=JPIM) :: INUMFLDS - INTEGER(KIND=JPIM) :: IGPTRSEND(2,NGPBLKS,NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTRRECV(NPRTRNS) - INTEGER(KIND=JPIM) :: IGPTROFF(NGPBLKS) - - ! INTEGER FUNCTIONS - INTEGER(KIND=JPIM) :: ISENDCOUNT,IRECVCOUNT,J - INTEGER(KIND=JPIM) :: JPOS(NGPBLKS),IFLDA(KF_GP),JI,JJ - INTEGER(KIND=JPIM) :: IFLDT - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - - INTEGER(KIND=JPIM) :: IERROR - - REAL(KIND=JPRBT) :: TIMEF, tc - - ! ------------------------------------------------------------------ - - !* 0. Some initializations - ! -------------------- - IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - - !$ACC UPDATE HOST(PREEL_REAL) - - CALL GSTATS(1806,0) - - LLINDER = .FALSE. - LLPGPUV = .FALSE. - LLPGP3A = .FALSE. - LLPGP3B = .FALSE. - LLPGP2 = .FALSE. - LLPGPONLY = .FALSE. - IF(PRESENT(KPTRGP)) LLINDER = .TRUE. - IF(PRESENT(PGP)) LLPGPONLY=.TRUE. - IF(PRESENT(PGPUV)) LLPGPUV=.TRUE. - IF(PRESENT(PGP3A)) LLPGP3A=.TRUE. - IF(PRESENT(PGP3B)) LLPGP3B=.TRUE. - IF(PRESENT(PGP2)) LLPGP2=.TRUE. - - IUVPAR=0 - IUVLEV=0 - IOFF1=0 - IOFFNS=KF_SCALARS_G - IOFFEW=2*KF_SCALARS_G - - LLUV(:) = .FALSE. - IF (LLPGPUV) THEN - IOFF=0 - IUVLEV=UBOUND(PGPUV,2) - IF(LVORGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF - IF(LDIVGP) THEN - IUVPAR=IUVPAR+1 - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR - LLUV(IOFF+J)=.TRUE. - ENDDO - IOFF=IOFF+IUVLEV - ENDIF - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - ENDDO - IUVPAR=IUVPAR+2 - LLUV(IOFF+1:IOFF+2*IUVLEV)=.TRUE. - IOFF=IOFF+2*IUVLEV - IOFF1=IOFF - IOFFNS=IOFFNS+IOFF - IOFFEW=IOFFEW+IOFF - - IOFF=IUVPAR*IUVLEV+KF_SCALARS_G - IF(LUVDER) THEN - IF(LSCDERS) IOFF=IOFF+KF_SCALARS_G - DO J=1,IUVLEV - IUVLEVS(IOFF+J)=J - IUVPARS(IOFF+J)=IUVPAR+1 - LLUV(IOFF+J)=.TRUE. - IUVLEVS(IOFF+J+IUVLEV)=J - IUVPARS(IOFF+J+IUVLEV)=IUVPAR+2 - LLUV(IOFF+J+IUVLEV)=.TRUE. - ENDDO - IUVPAR=IUVPAR+2 - IOFF=IOFF+2*IUVLEV - IOFFEW=IOFFEW+2*IUVLEV - ENDIF - ENDIF - - LLGP2(:)=.FALSE. - IF(LLPGP2) THEN - IOFF=IOFF1 - IGP2PAR=UBOUND(PGP2,2) - IF(LSCDERS) IGP2PAR=IGP2PAR/3 - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J - ENDDO - IOFF1=IOFF1+IGP2PAR - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+IGP2PAR - ENDDO - IOFFNS=IOFF+IGP2PAR - IOFF=IOFFEW - DO J=1,IGP2PAR - LLGP2(J+IOFF) = .TRUE. - IGP2PARS(J+IOFF)=J+2*IGP2PAR - ENDDO - IOFFEW=IOFF+IGP2PAR - ENDIF - ENDIF - - LLGP3A(:) = .FALSE. - IF(LLPGP3A) THEN - IGP3ALEV=UBOUND(PGP3A,2) - IGP3APAR=UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR=IGP3APAR/3 - IOFF=IOFF1 - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1 - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3APAR - IOFF1=IOFF1+IGP3APAR*IGP3ALEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3APAR - IOFFNS=IOFFNS+IGP3APAR*IGP3ALEV - IOFF=IOFFEW - DO J1=1,IGP3APAR - DO J2=1,IGP3ALEV - LLGP3A(J2+(J1-1)*IGP3ALEV+IOFF) = .TRUE. - IGP3APARS(J2+(J1-1)*IGP3ALEV+IOFF)=J1+IPAROFF - IGP3ALEVS(J2+(J1-1)*IGP3ALEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3APAR*IGP3ALEV - ENDIF - ENDIF - - LLGP3B(:) = .FALSE. - IF(LLPGP3B) THEN - IGP3BLEV=UBOUND(PGP3B,2) - IGP3BPAR=UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR=IGP3BPAR/3 - IOFF=IOFF1 - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1 - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IGP3BPAR - IOFF1=IOFF1+IGP3BPAR*IGP3BLEV - IF(LSCDERS) THEN - IOFF=IOFFNS - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IPAROFF=IPAROFF+IGP3BPAR - IOFFNS=IOFFNS+IGP3BPAR*IGP3BLEV - IOFF=IOFFEW - DO J1=1,IGP3BPAR - DO J2=1,IGP3BLEV - LLGP3B(J2+(J1-1)*IGP3BLEV+IOFF) = .TRUE. - IGP3BPARS(J2+(J1-1)*IGP3BLEV+IOFF)=J1+IPAROFF - IGP3BLEVS(J2+(J1-1)*IGP3BLEV+IOFF)=J2 - ENDDO - ENDDO - IOFFEW=IOFFEW+IGP3BPAR*IGP3BLEV - ENDIF - ENDIF - - CALL INIGPTR(IGPTRSEND,IGPTRRECV) - LLDONE = .FALSE. - ITAG = MTAGLG - - INDOFFX = 0 - IBUFLENS = 0 - IBUFLENR = 0 - INRECV = 0 - INSEND = 0 - - DO JROC=1,NPROC - - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - ISEND = JROC - ISENT(JROC) = 0 - IRCVD(JROC) = 0 - - ! count up expected number of fields - IPOS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1) IPOS = IPOS+1 - ENDDO - IRECVTOT(JROC) = IGPTRRECV(ISETW)*IPOS - IF(IRECVTOT(JROC) > 0 .AND. MYPROC /= JROC) THEN - INRECV = INRECV + 1 - JRECV(INRECV)=JROC - ENDIF - - IF( JROC /= MYPROC) IBUFLENR = MAX(IBUFLENR,IRECVTOT(JROC)) - - IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) - ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) - - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - IPOS = IPOS+D%NONL(IGL,ISETB) - ENDDO - - ISENDTOT(JROC) = IPOS*KF_FS - IF( JROC /= MYPROC) THEN - IBUFLENS = MAX(IBUFLENS,ISENDTOT(JROC)) - IF(ISENDTOT(JROC) > 0) THEN - INSEND = INSEND+1 - JSEND(INSEND)=JROC - ENDIF - ENDIF - - IF(IPOS > 0) THEN - INDOFF(JROC) = INDOFFX - INDOFFX = INDOFFX+IPOS - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - IGL = D%NPTRFRSTLAT(ISETA)+JGL-D%NFRSTLAT(ISETA) - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL),& - &D%NSTA(IGL,ISETB)+D%NSTAGTF(IGLL)+D%NONL(IGL,ISETB)-1 - IPOS = IPOS+1 - INDEX(IPOS+INDOFF(JROC)) = JL - ENDDO - ENDDO - ENDIF - ENDDO - - ISENDCOUNT=0 - IRECVCOUNT=0 - DO J=1,NPROC - ISENDCOUNT=MAX(ISENDCOUNT,ISENDTOT(J)) - IRECVCOUNT=MAX(IRECVCOUNT,IRECVTOT(J)) - ENDDO - IF (IBUFLENS > 0) ALLOCATE(ZCOMBUFS(-1:ISENDCOUNT,INSEND)) - IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(-1:IRECVCOUNT,INRECV)) - - CALL GSTATS(1806,1) - - - ! Copy local contribution - IF( IRECVTOT(MYPROC) > 0 )THEN - IFLDS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN - IFLDS = IFLDS+1 - IF(LLINDER) THEN - IFLDOFF(IFLDS) = KPTRGP(JFLD) - ELSE - IFLDOFF(IFLDS) = JFLD - ENDIF - ENDIF - ENDDO - - IPOS=0 - DO JBLK=1,NGPBLKS - IGPTROFF(JBLK)=IPOS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IPOS=IPOS+ILAST-IFIRST+1 - ENDIF - ENDDO - - CALL GSTATS(1604,0) - #ifdef NECSX - !$OMP PARALLEL DO SCHEDULE(DYNAMIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) - #else - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) - #endif - - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IF(LLPGPONLY) THEN - IF(LLINDER) THEN - DO JFLD=1,IFLDS - IFLD = KPTRGP(JFLD) - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP(JK,IFLD,JBLK) = PREEL_REAL((INDEX(IPOS)-1)*KF_FS+JFLD) - ENDDO - ENDDO - ELSE - DO JFLD=1,IFLDS - IFLD = IFLDOFF(JFLD) - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - PGP(JK,IFLD,JBLK) = PREEL_REAL((INDEX(IPOS)-1)*KF_FS+JFLD) - ENDDO - ENDDO - ENDIF - ELSE - DO JFLD=1,IFLDS - IFLD = IFLDOFF(JFLD) - IF(LLUV(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) = PREEL_REAL(JFLD+KF_FS*IPOS) - ENDDO - ELSEIF(LLGP2(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PGP2(JK,IGP2PARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) - ENDDO - ELSEIF(LLGP3A(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) - ENDDO - ELSEIF(LLGP3B(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDEX(INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1)-1 - PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)=PREEL_REAL(JFLD+KF_FS*IPOS) - ENDDO - ELSE - WRITE(NOUT,*)'TRLTOG_MOD: ERROR',JFLD,IFLD - CALL ABORT_TRANS('TRLTOG_MOD: ERROR') - ENDIF - ENDDO - ENDIF - ENDIF - ENDDO - !$OMP END PARALLEL DO - CALL GSTATS(1604,1) - - ENDIF - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - ! - ! loop over the number of processors we need to communicate with. - ! NOT MYPROC - ! - ! Pack loop......................................................... - - CALL GSTATS(1605,0) - - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JL,II,JFLD,INS,ISEND,ILEN,ISEND_FLD_END) - DO INS=1,INSEND - ISEND=JSEND(INS) - ISEND_FLD_START(ISEND)= 1 - ILEN = ISENDTOT(ISEND)/KF_FS - ISEND_FLD_END = KF_FS - #ifdef NECSX - DO JFLD=ISEND_FLD_START(ISEND),ISEND_FLD_END - DO JL=1,ILEN - II = INDEX(INDOFF(ISEND)+JL) - #else - DO JL=1,ILEN - II = INDEX(INDOFF(ISEND)+JL) - DO JFLD=ISEND_FLD_START(ISEND),ISEND_FLD_END - #endif - ZCOMBUFS((JFLD-ISEND_FLD_START(ISEND))*ILEN+JL,INS) = PREEL_REAL((II-1)*KF_FS+JFLD) - ENDDO - ENDDO - ZCOMBUFS(-1,INS) = 1 - ZCOMBUFS(0,INS) = KF_FS - ENDDO - !$OMP END PARALLEL DO - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "packing (trltog) in sec: ", Tc - #endif - - CALL GSTATS(1605,1) - - IR=0 - CALL GSTATS(805,0) - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - !...Receive loop......................................................... - DO INR=1,INRECV - IR=IR+1 - IRECV=JRECV(INR) - CALL MPL_RECV(ZCOMBUFR(-1:IRECVTOT(IRECV),INR), & - & KSOURCE=NPRCIDS(IRECV), & - & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR), & - & KTAG=ITAG,CDSTRING='TRLTOG:' ) - ENDDO - - !...Send loop......................................................... - DO INS=1,INSEND - IR=IR+1 - ISEND=JSEND(INS) - CALL MPL_SEND(ZCOMBUFS(-1:ISENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND),& - & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ(IR), & - & KTAG=ITAG,CDSTRING='TRLTOG:') - ENDDO - - IF(IR > 0) THEN - CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRLTOG: WAIT FOR SENDS AND RECEIVES') - ENDIF - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "non-CUDA-aware isend/irecv (trltog) in sec: ", Tc - #endif - - CALL GSTATS(805,1) - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=TIMEF() - #endif - ! Unpack loop......................................................... - - CALL GSTATS(1606,0) - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(IFLDT,JBLK,IFIRST,ILAST,JK,& - !$OMP& JJ,JI,JPOS,INR,IRECV,IRECVSET,IRECV_FLD_START,IRECV_FLD_END,IPOS,& - !$OMP& ISETA,ISETB,ISETW,ISETV,JFLD,IFLD,IFLDA) - DO INR=1,INRECV - IRECV=JRECV(INR) - CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) - IRECVSET = ISETV - IRECV_FLD_START = ZCOMBUFR(-1,INR) - IRECV_FLD_END = ZCOMBUFR(0,INR) - IFLD = 0 - IPOS = 0 - DO JFLD=1,KF_GP - IF(KVSET(JFLD) == IRECVSET .OR. KVSET(JFLD) == -1 ) THEN - IFLD = IFLD+1 - IFLDA(IFLD)=JFLD - ENDIF - ENDDO - - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - JPOS(JBLK)=IPOS - IPOS=IPOS+(ILAST-IFIRST+1) - ENDIF - ENDDO - - - DO JJ=IRECV_FLD_START,IRECV_FLD_END - IFLDT=IFLDA(JJ) - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,ISETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,ISETW) - IF(LLINDER) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGP(JK,KPTRGP(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ELSEIF(LLPGPONLY) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGP(JK,IFLDT,JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ELSEIF(LLUV(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGPUV(JK,IUVLEVS(IFLDT),IUVPARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ELSEIF(LLGP2(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGP2(JK,IGP2PARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ELSEIF(LLGP3A(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGP3A(JK,IGP3ALEVS(IFLDT),IGP3APARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ELSEIF(LLGP3B(IFLDT)) THEN - DO JK=IFIRST,ILAST - JI=(JJ-IRECV_FLD_START)*IPOS+JPOS(JBLK)+JK-IFIRST+1 - PGP3B(JK,IGP3BLEVS(IFLDT),IGP3BPARS(IFLDT),JBLK) = ZCOMBUFR(JI,INR) - ENDDO - ENDIF - ENDIF - ENDDO - ENDDO - - IPOS=(IRECV_FLD_END-IRECV_FLD_START+1)*IPOS - ENDDO - !$OMP END PARALLEL DO - - #ifdef COMVERBOSE - call MPI_BARRIER(MPI_COMM_WORLD,IERROR) - Tc=(TIMEF()-Tc)/1000.0_JPRBT - !IF(MPL_MYRANK==1) WRITE(*,*) "unpacking (trltog) in sec: ", Tc - #endif - - CALL GSTATS(1606,1) - IF (IBUFLENS > 0) DEALLOCATE(ZCOMBUFS) - IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) - - IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) - - END SUBROUTINE TRLTOG END MODULE TRLTOG_MOD diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 2286393ab..ae282ef17 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -12,6 +12,9 @@ MODULE TRLTOM_MOD USE ALLOCATOR_MOD IMPLICIT NONE + PRIVATE + PUBLIC :: TRLTOM_CUDAAWARE, PREPARE_TRLTOM, TRLTOM_HANDLE + TYPE TRLTOM_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF END TYPE diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index c5acfac8f..874cd9a73 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -9,9 +9,29 @@ ! MODULE TRMTOL_MOD + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRMTOL_CUDAAWARE, PREPARE_TRMTOL, TRMTOL_HANDLE + + TYPE TRMTOL_HANDLE + END TYPE CONTAINS -SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) + FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_FS) RESULT(HTRMTOL) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + TYPE(TRMTOL_HANDLE) :: HTRMTOL + + REAL(KIND=JPRBT) :: DUMMY + END FUNCTION +SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) !**** *trmtol * - transposition in Fourier space @@ -82,6 +102,9 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(TRMTOL_HANDLE), INTENT(IN) :: HTRMTOL + #ifdef PARKINDTRANS_SINGLE #define TRMTOL_DTYPE MPI_REAL #else @@ -162,114 +185,4 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) ! ------------------------------------------------------------------ END SUBROUTINE TRMTOL_CUDAAWARE -SUBROUTINE TRMTOL(PFBUF_IN,PFBUF,KFIELD) - -!**** *TRMTOL * - transposition in Fourier space - -! Purpose. -! -------- -! Transpose Fourier buffer data from partitioning -! over wave numbers to partitioning over latitudes. -! It is called between direct FFT and direct Legendre -! transform. -! This routine is the inverse of TRLTOM. - - -!** Interface. -! ---------- -! *CALL* *TRMTOL(...)* - -! Explicit arguments : PFBUF - Fourier coefficient buffer. It is -! -------------------- used for both input and output. -! KFIELD - Number of fields communicated - -! Implicit arguments : -! -------------------- - -! Method. -! ------- -! See documentation - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! MPP Group *ECMWF* - -! Modifications. -! -------------- -! Original : 95-10-01 -! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use -! (NCOMBFLEN) for nphase.eq.1 -! Modified : 99-05-28 D.Salmond - Optimise copies. -! Modified : 00-02-02 M.Hamrud - Remove NPHASE -! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message -! passing and buffer packing -! G.Mozdzynski: 08-01-01 Cleanup -! Y.Seity : 07-08-31 Add barrier synchronisation under LSYNC_TRANS -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM -USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE MPI - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(OUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT), INTENT(IN), ALLOCATABLE :: PFBUF_IN(:) - -INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) -INTEGER(KIND=JPIM) :: J, ILEN, ISTA -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -INTEGER(KIND=JPIM) :: IERROR - -IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) - -ALLOCATE(PFBUF(D%NLENGT0B*KFIELD)) -!$ACC ENTER DATA CREATE(PFBUF) - -IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*KFIELD - ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*KFIELD - ENDDO - - CALL GSTATS(807,0) - !$ACC UPDATE HOST(PFBUF_IN) - - CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN,KSENDCOUNTS=ILENS,& - & PRECVBUF=PFBUF,KRECVCOUNTS=ILENR,KSENDDISPL=IOFFS,KRECVDISPL=IOFFR,& - & KCOMM=MPL_ALL_MS_COMM,CDSTRING='TRMTOL:') - - !$ACC UPDATE DEVICE(PFBUF) - CALL MPI_BARRIER(MPI_COMM_WORLD,IERROR) - CALL GSTATS(807,1) -ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 - CALL GSTATS(1608,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1608,1) -ENDIF - -!$ACC EXIT DATA DELETE(PFBUF_IN) -DEALLOCATE(PFBUF_IN) - -IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ -END SUBROUTINE TRMTOL END MODULE TRMTOL_MOD From e9dde3b165d509d72c19dc6f123d9914abfffd93 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:37 -0700 Subject: [PATCH 232/263] Initial allocation buffering implementation for inv trans --- .../gpu/algor/external/fourier/fft_wrapper.cu | 2 +- .../gpu/algor/external/gemm/gemm_wrapper.cu | 5 +- src/trans/gpu/internal/allocator_mod.F90 | 6 +- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 9 +- src/trans/gpu/internal/fourier_in_mod.F90 | 27 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 98 ++- src/trans/gpu/internal/leinv_mod.F90 | 679 +++++++++--------- src/trans/gpu/internal/ltinv_mod.F90 | 170 ++++- src/trans/gpu/internal/tpm_trans.F90 | 3 +- src/trans/gpu/internal/trgtol_mod.F90 | 13 +- src/trans/gpu/internal/trltog_mod.F90 | 36 +- src/trans/gpu/internal/trltom_pack_unpack.F90 | 24 +- src/trans/gpu/internal/trmtol_mod.F90 | 38 +- 13 files changed, 623 insertions(+), 487 deletions(-) diff --git a/src/trans/gpu/algor/external/fourier/fft_wrapper.cu b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu index 26b84ddc4..fcff3eefe 100644 --- a/src/trans/gpu/algor/external/fourier/fft_wrapper.cu +++ b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu @@ -95,7 +95,7 @@ void execute_fft(typename Type::real *data_real, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the FFT plans, if this happens more often, // we should cache this... - std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + std::cout << "WARNING FFT: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; CUDA_CHECK(cudaGraphExecDestroy(graphCache[kfield])); graphCache.erase(kfield); ptrCache.erase(kfield); diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index e89a4257b..9f3afc161 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -75,7 +75,10 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the cublas handles, if this happens more // often, we should cache this... - std::cout << "WARNING: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; + std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n"; + std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id << "}\n"; + std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " << std::get<1>(ptrs->second) << ", " << std::get<2>(ptrs->second) << " vs. " + << A << ", " << B << ", " << C << std::endl; CUDA_CHECK(cudaGraphExecDestroy(graphCache[key])); graphCache.erase(key); ptrCache.erase(key); diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index 9b0c5bdf4..a31b5903d 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -52,6 +52,11 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) CONTAINS + ! TODO This is not perfect yet. We will over-allocate up to 2X in theory. + ! It would be better to always keep the previous allocation size and then + ! have one allocation sitting at the the top, and the double-buffer at + ! the bottom of the allocation. + FUNCTION MAKE_BUFFERED_ALLOCATOR() TYPE(BUFFERED_ALLOCATOR) :: MAKE_BUFFERED_ALLOCATOR @@ -90,7 +95,6 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) OLD_PTR => ALLOCATOR%PTR ELSE ALLOCATOR%PTR(1:) => OLD_PTR(1:) - NULLIFY(OLD_PTR) ENDIF ELSE ALLOCATE(ALLOCATOR%PTR(1:SUM(ALLOCATOR%BUFR_SZ))) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index ce2272cb9..f389604cd 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -149,8 +149,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT TYPE(LTDIR_HANDLE) :: HLTDIR -INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) - IF(NPROMATR > 0) THEN PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" STOP 4 @@ -170,11 +168,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) ! TODO this is going to be simplified when we have it implemented for invtrans too - IF(ALLOCATED(REUSE_PTR)) & - CALL C_F_POINTER(C_LOC(REUSE_PTR), PTR, (/ SIZEOF(REUSE_PTR) /) ) - CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, PTR) - IF (ASSOCIATED(PTR)) & - CALL C_F_POINTER(C_LOC(PTR), REUSE_PTR, (/ SIZEOF(PTR)/SIZEOF(REUSE_PTR(1)) /) ) + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) ! from the PGP arrays to PREEL_REAL CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -202,6 +196,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ENDIF ELSE + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' ! Short cut - no need to go through tansforms, we will go directly into ! the legendre space CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 index aa94e6da4..1f3b53979 100755 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ b/src/trans/gpu/internal/fourier_in_mod.F90 @@ -16,19 +16,25 @@ MODULE FOURIER_IN_MOD PUBLIC :: FOURIER_IN, FOURIER_IN_HANDLE, PREPARE_FOURIER_IN TYPE FOURIER_IN_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL END TYPE CONTAINS - FUNCTION PREPARE_FOURIER_IN(ALLOCATOR) RESULT(HFOURIER_IN) + FUNCTION PREPARE_FOURIER_IN(ALLOCATOR,KF_FS) RESULT(HFOURIER_IN) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM) :: KF_FS TYPE(FOURIER_IN_HANDLE) :: HFOURIER_IN + REAL(KIND=JPRBT) :: DUMMY + + HFOURIER_IN%HREEL = RESERVE(ALLOCATOR, D%NLENGTF*KF_FS*SIZEOF(DUMMY)) + END FUNCTION SUBROUTINE FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) @@ -66,25 +72,17 @@ SUBROUTINE FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_T IMPLICIT NONE -REAL(KIND=JPRBT), ALLOCATABLE, INTENT(INOUT) :: FOUBUF(:) -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(FOURIER_IN_HANDLE), INTENT(IN) :: HFOURIER_IN INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF +CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HFOURIER_IN%HREEL),& + & 1_C_SIZE_T, KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1))) !$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) @@ -119,9 +117,6 @@ SUBROUTINE FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_T !$ACC WAIT(1) -!$ACC EXIT DATA DELETE(FOUBUF) -DEALLOCATE(FOUBUF) - END SUBROUTINE FOURIER_IN END MODULE FOURIER_IN_MOD diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index f74707833..febb6bdaa 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -135,13 +135,14 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& INTEGER(KIND=JPIM) :: KFIELD -REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF(:) -REAL(KIND=JPRB), ALLOCATABLE :: FOUBUF_IN(:) +REAL(KIND=JPRB), POINTER :: FOUBUF(:) +REAL(KIND=JPRB), POINTER :: FOUBUF_IN(:) REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET +INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER INTEGER(KIND=JPIM) :: IST INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) @@ -158,6 +159,8 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& TYPE(FTINV_HANDLE) :: HFTINV TYPE(TRLTOG_HANDLE) :: HTRLTOG +INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) + ! ------------------------------------------------------------------ IF(NPROMATR > 0) THEN @@ -165,71 +168,29 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& stop 24 ENDIF - ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() - HLTINV = PREPARE_LTINV(ALLOCATOR,KF_GP,KF_FS) - HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,KF_FS) - HFOURIER_IN = PREPARE_FOURIER_IN(ALLOCATOR) - HFSC = PREPARE_FSC(ALLOCATOR) - HFTINV = PREPARE_FTINV(ALLOCATOR) - HTRLTOG = PREPARE_TRLTOG(ALLOCATOR) - - ! No splitting of fields, transform done in one go - ! from PSPXXX to FOUBUF - - CALL GSTATS(102,0) - CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & FOUBUF_IN,KFIELD) - CALL GSTATS(102,1) - - CALL GSTATS(152,0) - WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,KFIELD) - CALL GSTATS(152,1) - - CALL GSTATS(107,0) - - ! Compute PREEL_COMPLEX Domain decomposition - IFIRST = 0 - IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity - IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence - IFIRST = IFIRST + 2*KF_UV ! U and V - IFIRST = IFIRST + KF_SCALARS ! Scalars - IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives - ! This verifies if we get the same assumptions about how much data we get from the LT space - IF (2*IFIRST /= KFIELD) CALL ABORT_TRANS('Size mismatch: LT and FT do not agree on input size') - IF (LUVDER) IFIRST = IFIRST+2*KF_UV ! U and V derivatives - IF (LSCDERS) IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives - IF (IFIRST /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') - - IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ELSEIF (SIZE(REUSE_PTR) < KF_FS*D%NLENGTF) THEN - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(KF_FS*D%NLENGTF)) - !$ACC ENTER DATA CREATE(REUSE_PTR) - ENDIF - PREEL_COMPLEX => REUSE_PTR + ! Compute Vertical domain decomposition ! Initialize potentially unset offsets KSCALARS_NSDER_OFFSET = -1 KUV_EWDER_OFFSET = -1 KSCALARS_EWDER_OFFSET = -1 - ! And reiterate domain decomposition to assign pointers + ! (note in ltinv we will initially start with a slightly different domain decomposition + ! which always has vorticity and divergence because this is the actual input) IFIRST = 0 IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence KUV_OFFSET = IFIRST - IFIRST = IFIRST + 2*KF_UV ! U and V + IFIRST = IFIRST + KF_UV ! U + IFIRST = IFIRST + KF_UV ! V KSCALARS_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars IF (LSCDERS) THEN KSCALARS_NSDER_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives ENDIF + ! the rest of fields is being computed in fourier space, namely in FSC + IF_LEG = IFIRST IF (LUVDER) THEN KUV_EWDER_OFFSET = IFIRST IFIRST = IFIRST+2*KF_UV ! U and V derivatives @@ -238,14 +199,45 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& KSCALARS_EWDER_OFFSET = IFIRST IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives ENDIF + IF_FOURIER = IFIRST + IF (IF_FOURIER /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') + + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HLTINV = PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) + HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG) + HFOURIER_IN = PREPARE_FOURIER_IN(ALLOCATOR,IF_FOURIER) + HFSC = PREPARE_FSC(ALLOCATOR) + HFTINV = PREPARE_FTINV(ALLOCATOR) + HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP) + + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) + + ! No splitting of fields, transform done in one go + ! from PSPXXX to FOUBUF + + CALL GSTATS(102,0) + IF (KF_FS > 0) THEN + CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & FOUBUF_IN,KFIELD) + ENDIF + CALL GSTATS(102,1) + + CALL GSTATS(152,0) + WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' + IF (KF_FS > 0) CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL GSTATS(152,1) + + CALL GSTATS(107,0) + ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now - CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) + IF (KF_FS > 0) CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) ! 2. Fourier space computations ! fill the rest of PREEL_COMPLEX - CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + IF (KF_FS > 0) CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) ! 3. Fourier transform diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 4969ffe3e..ec38ef3b0 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -10,366 +10,355 @@ ! MODULE LEINV_MOD -CONTAINS -SUBROUTINE LEINV(PIA,FOUBUF_IN) - -!**** *LEINV* - Inverse Legendre transform. - -! Purpose. -! -------- -! Inverse Legendre tranform of all variables(kernel). - -!** Interface. -! ---------- -! CALL LEINV(...) + USE PARKIND_ECTRANS ,ONLY : JPIM + IMPLICIT NONE -! Explicit arguments : KM - zonal wavenumber (input-c) -! -------------------- KFC - number of fields to tranform (input-c) -! PIA - spectral fields -! for zonal wavenumber KM (input) + PRIVATE + PUBLIC :: LEINV_STRIDES, LEINV -! Implicit arguments : None. -! -------------------- + INTEGER(KIND=JPIM) :: A = 8 !Alignment -! Method. use butterfly or dgemm -! ------- - -! Externals. -! ---------- +CONTAINS + SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 + + + IF (PRESENT(IOUT_STRIDES0)) & + IOUT0_STRIDES0 = ALIGN(KF_LEG,A) + IF (PRESENT(IOUT_STRIDES1)) & + IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IIN_STRIDES0)) & + IIN_STRIDES0 = ALIGN(2*KF_LEG,A) + IF (PRESENT(IIN_STRIDES1)) & + IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IOUT0_STRIDES0)) & + IOUT_STRIDES0 = ALIGN(2*KF_LEG,A) + IF (PRESENT(IOUT0_STRIDES1)) & + IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IIN0_STRIDES0)) & + IIN0_STRIDES0 = ALIGN(KF_LEG,A) + IF (PRESENT(IIN0_STRIDES1)) & + IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + END SUBROUTINE + + SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) + + !**** *LEINV* - Inverse Legendre transform. + + ! Purpose. + ! -------- + ! Inverse Legendre tranform of all variables(kernel). + + !** Interface. + ! ---------- + ! CALL LEINV(...) + + ! Explicit arguments : KM - zonal wavenumber (input-c) + ! -------------------- KFC - number of fields to tranform (input-c) + ! PIA - spectral fields + ! for zonal wavenumber KM (input) + + ! Implicit arguments : None. + ! -------------------- + + ! Method. use butterfly or dgemm + ! ------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + ! + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL + USE TPM_GEOMETRY ,ONLY : G, G_NDGLU + USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC + USE TPM_GEN, ONLY: NOUT + USE CUDA_GEMM_BATCHED_MOD + USE, INTRINSIC :: ISO_C_BINDING + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE IEEE_ARITHMETIC + USE OPENACC + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + + ! DUMMY ARGUMENTS + INTEGER(KIND=JPIM) :: KM + INTEGER(KIND=JPIM) :: KMLOC + INTEGER(KIND=JPIM) :: KIFC + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) + REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), INTENT(OUT) :: ZINP(:), ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), INTENT(OUT) :: ZINP0(:), ZOUTS0(:), ZOUTA0(:) + + ! LOCAL + REAL(KIND=JPRBT) :: ZAOA, ZSOA + + INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: KF_LEG + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + + !* 1.1 PREPARATIONS. + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) + + KF_LEG = SIZE(PIA,1)/2 + ! ------------------------------------------------------------------ + + !* 1. PERFORM LEGENDRE TRANFORM. + ! -------------------------- + + !* 1.1 PREPARATIONS. + + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + + !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & + !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & + !$ACC& PRESENT(ZAA,ZAS,PIA) & + !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS + IF (KMLOC0 > 0) THEN + print*,'computing m=0 in double precision' + ENDIF -! Author. -! ------- -! Nils Wedi + Mats Hamrud + George Modzynski -! -! Modifications. -! -------------- -! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: -! F. Vana 05-Mar-2015 Support for single precision -! ------------------------------------------------------------------ - -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL -USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_TRANS, ONLY: REUSE_PTR -USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 -USE TPM_GEN, ONLY: NOUT -USE CUDA_GEMM_BATCHED_MOD -USE, INTRINSIC :: ISO_C_BINDING -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE IEEE_ARITHMETIC -USE OPENACC -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - -IMPLICIT NONE - - -! DUMMY ARGUMENTS -INTEGER(KIND=JPIM) :: KM -INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM) :: KIFC -INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) -REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) -REAL(KIND=JPRB), INTENT(OUT), ALLOCATABLE :: FOUBUF_IN(:) - -! LOCAL -REAL(KIND=JPRBT), POINTER :: ZBASE(:), ZINP(:), ZOUTS(:), ZOUTA(:) -REAL(KIND=JPRD), POINTER :: ZBASE0(:), ZINP0(:), ZOUTS0(:), ZOUTA0(:) -REAL(KIND=JPRBT) :: ZAOA, ZSOA - -INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 -INTEGER(KIND=JPIM) :: KFIELDS -INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 -INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 -INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 -INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 -INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS - -INTEGER(KIND=JPIM) :: A = 8 !Alignment - -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - - -!* 1.1 PREPARATIONS. -IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) - -KFIELDS = SIZE(PIA,1) -! ------------------------------------------------------------------ - -!* 1. PERFORM LEGENDRE TRANFORM. -! -------------------------- - -!* 1.1 PREPARATIONS. - -IIN_STRIDES0 = ALIGN(KFIELDS,A) -IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) -IOUT_STRIDES0 = ALIGN(KFIELDS,A) -IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R_NDGNH,A) -IIN0_STRIDES0 = ALIGN(KFIELDS/2,A) -IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) -IOUT0_STRIDES0 = ALIGN(KFIELDS/2,A) -IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R_NDGNH,A) - -! Check if the reuse buffer is large enough -ALLOC_SZ = ALIGN(IIN_STRIDES1*D_NUMP,8)*SIZEOF(ZINP(1)) & - +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTS(1)) & - +ALIGN(IOUT_STRIDES1*D_NUMP,8)*SIZEOF(ZOUTA(1)) & - +ALIGN(IIN0_STRIDES1,8)*SIZEOF(ZINP0(1)) & - +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUTS0(1)) & - +ALIGN(IOUT0_STRIDES1,8)*SIZEOF(ZOUTA0(1)) -IF (.NOT. ALLOCATED(REUSE_PTR)) THEN - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ELSEIF (SIZEOF(REUSE_PTR) <= ALLOC_SZ) THEN - ! and reallocate if needed - !$ACC EXIT DATA DELETE(REUSE_PTR) - DEALLOCATE(REUSE_PTR) - ALLOCATE(REUSE_PTR(ALLOC_SZ/SIZEOF(REUSE_PTR(1)))) - !$ACC ENTER DATA CREATE(REUSE_PTR) -ENDIF - -! Figure out which pointers to use -ALLOC_POS=1 -CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE, & - & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE(0))]) - -ZINP(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IIN_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN_STRIDES1*D_NUMP,8) -ZOUTS(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) -ZOUTA(1:) => ZBASE(ALLOC_POS:ALLOC_POS+IOUT_STRIDES1*D_NUMP-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT_STRIDES1*D_NUMP,8) - -! The BASE0 pointer points to the rest, but likely in a different type! -CALL C_F_POINTER(C_LOC(REUSE_PTR(ALLOC_POS:)), ZBASE0, & - & [SIZEOF(REUSE_PTR(ALLOC_POS:))/SIZEOF(ZBASE0(0))]) -ALLOC_POS=1 -ZINP0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IIN0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IIN0_STRIDES1,8) -ZOUTS0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) -ZOUTA0(1:) => ZBASE0(ALLOC_POS:ALLOC_POS+IOUT0_STRIDES1-1) -ALLOC_POS=ALLOC_POS+ALIGN(IOUT0_STRIDES1,8) - -!$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & -!$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & -!$ACC& PRESENT(ZAA,ZAS,PIA) & -!$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) - -ALLOCATE(FOUBUF_IN(D%NLENGT1B*KFIELDS)) -!$ACC ENTER DATA CREATE(FOUBUF_IN) - -IF (KMLOC0 > 0) THEN - print*,'computing m=0 in double precision' -ENDIF - -! READ 2:NSMAX+3 - -!IF KM=0 and NSMAX is 6: -! IA=1 -! DO=1,6/2+1 ... 1..4 -! PIA_2=1+1+(J-1)*2 ...2+(0..3)*2 .... 2,4,6,8 -!IF KM=0 and NSMAX is 7: -! IA=2 -! DO=1,7/2+1 ... 1..4 -! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JK=1,KFIELDS - KM = D_MYMS(KMLOC) - IA = 1+MOD(R_NSMAX-KM+2,2) - IF(KM /= 0)THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) - ENDDO - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - !$ACC LOOP SEQ - DO J=1,(R_NSMAX+2)/2 - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC) + ! READ 2:NSMAX+3 + + !IF KM=0 and NSMAX is 6: + ! IA=1 + ! DO=1,6/2+1 ... 1..4 + ! PIA_2=1+1+(J-1)*2 ...2+(0..3)*2 .... 2,4,6,8 + !IF KM=0 and NSMAX is 7: + ! IA=2 + ! DO=1,7/2+1 ... 1..4 + ! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 + + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NSMAX-KM+2,2) + IF(KM /= 0)THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+2)/2 + ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ! every other field is sufficient because Im(KM=0) == 0 + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+2)/2 + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC) + ENDDO + ENDIF ENDDO + ENDDO + + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF - ENDDO -ENDDO - -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) -ENDIF -CALL GSTATS(424,0) - -IF (KMLOC0 > 0) THEN - ! compute m=0 in double precision: - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, G%NDGLU(0), (R%NSMAX+2)/2, & - & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & - & ZAA0, SIZE(ZAA0,1), 0, & - & 0.0_JPRD, & - & ZOUTA0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_LONG) -ENDIF - -DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - KS(KMLOC) = (R%NSMAX-KM+2)/2 - NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) -ENDDO -IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 -ENDIF -CALL CUDA_GEMM_BATCHED( & - & 11, & ! unique identifier - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, NS(:), KS(:), & - & 1.0_JPRBT, & - & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & - & 0.0_JPRBT, & - & ZOUTA, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_LONG) - -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(444,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(444,1) -ENDIF -CALL GSTATS(424,1) - -! 2. +++++++++++++ symmetric -!IF KM=0 and NSMAX is 6: -! IS=2 -! DO=1,4 -! PIA_2=2+1+(0..3)*2 ... 3+(0..3)*2 ... 3,5,7,9 -!IF KM=0 and NSMAX is 7: -! IS=1 -! DO=1,5 -! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JK=1,KFIELDS - KM = D_MYMS(KMLOC) - IS = 1+MOD(R_NSMAX-KM+1,2) - IF(KM /= 0) THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) - ENDDO - ELSEIF (MOD((JK-1),2) == 0) THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX+3)/2 - ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC) - ENDDO + CALL GSTATS(424,0) + + IF (KMLOC0 > 0) THEN + ! compute m=0 in double precision: + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KF_LEG, G%NDGLU(0), (R%NSMAX+2)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUTA0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_LONG) ENDIF - ENDDO -ENDDO - -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) -ENDIF -CALL GSTATS(424,0) - -IF (KMLOC0 > 0) THEN - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS/2, G%NDGLU(0), (R%NSMAX+3)/2, & - & 1.0_JPRD, & - & ZINP0, IIN0_STRIDES0, 0, & - & ZAS0, SIZE(ZAS0,1), 0, & - & 0.0_JPRD, & - & ZOUTS0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_LONG) -ENDIF - -DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - KS(KMLOC) = (R%NSMAX-KM+3)/2 - NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) -ENDDO -IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 -ENDIF -CALL CUDA_GEMM_BATCHED( & - & 12, & ! unique identifier - & CUBLAS_OP_N, CUBLAS_OP_T, & - & KFIELDS, NS(:), KS(:), & - & 1.0_JPRBT, & - & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & - & 0.0_JPRBT, & - & ZOUTS, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_LONG) -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(444,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(444,1) -ENDIF -CALL GSTATS(424,1) - -!$ACC DATA PRESENT(FOUBUF_IN) -!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JK=1,KFIELDS - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*KFIELDS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*KFIELDS + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + KS(KMLOC) = (R%NSMAX-KM+2)/2 + NS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF + CALL CUDA_GEMM_BATCHED( & + & 11, & ! unique identifier + & CUBLAS_OP_N, CUBLAS_OP_T, & + & 2*KF_LEG, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUTA, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_LONG) + + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) + ENDIF + CALL GSTATS(424,1) + + ! 2. +++++++++++++ symmetric + !IF KM=0 and NSMAX is 6: + ! IS=2 + ! DO=1,4 + ! PIA_2=2+1+(0..3)*2 ... 3+(0..3)*2 ... 3,5,7,9 + !IF KM=0 and NSMAX is 7: + ! IS=1 + ! DO=1,5 + ! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 + + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) DEFAULT(NONE) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NSMAX-KM+1,2) IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ELSE - ! Imaginary values of KM=0 is zero, though I don't think we care - ZSOA = 0_JPRBT - ZAOA = 0_JPRBT + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+3)/2 + ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO + ELSEIF (MOD((JK-1),2) == 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+3)/2 + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC) + ENDDO ENDIF + ENDDO + ENDDO - FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA - FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA - ENDIF + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(424,0) + + IF (KMLOC0 > 0) THEN + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_T, & + & KF_LEG, G%NDGLU(0), (R%NSMAX+3)/2, & + & 1.0_JPRD, & + & ZINP0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUTS0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_LONG) + ENDIF + + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + KS(KMLOC) = (R%NSMAX-KM+3)/2 + NS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF + CALL CUDA_GEMM_BATCHED( & + & 12, & ! unique identifier + & CUBLAS_OP_N, CUBLAS_OP_T, & + & 2*KF_LEG, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINP, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUTS, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_LONG) + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(444,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(444,1) + ENDIF + CALL GSTATS(424,1) + + !$ACC DATA PRESENT(FOUBUF_IN) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG + + IF(KM /= 0) THEN + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ELSE + ! Imaginary values of KM=0 is zero, though I don't think we care + ZSOA = 0_JPRBT + ZAOA = 0_JPRBT + ENDIF + + FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA + FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA + ENDIF + ENDDO + ENDDO ENDDO - ENDDO -ENDDO -!$ACC WAIT(1) + !$ACC WAIT(1) -!$ACC END DATA -!$ACC END DATA + !$ACC END DATA + !$ACC END DATA -IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ -END SUBROUTINE LEINV + END SUBROUTINE LEINV END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 68fda910c..3ada1996b 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -17,23 +18,86 @@ MODULE LTINV_MOD PUBLIC :: LTINV, LTINV_HANDLE, PREPARE_LTINV TYPE LTINV_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPIA_AND_IN + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUTS_AND_OUTA + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN END TYPE CONTAINS - FUNCTION PREPARE_LTINV(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTINV) + FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HLTINV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD USE TPM_DISTR, ONLY: D USE TPM_DIM, ONLY: R USE ISO_C_BINDING - USE LEDIR_MOD + USE LEINV_MOD USE ALLOCATOR_MOD IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS + LOGICAL, INTENT(IN) :: LVORGP,LDIVGP,LSCDERS TYPE(LTINV_HANDLE) :: HLTINV + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ, IPIA_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + REAL(KIND=JPRD) :: ZPRD_DUMMY + + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + + ! # fields that are initially read. We always read vorticity + ! and divergence! Also keep in mind that we actually have 2X + ! this number of levels because real+complex + IF_READIN = 0 + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! U + IF_READIN = IF_READIN + KF_UV ! V + IF_READIN = IF_READIN + KF_SCALARS ! Scalars + IF (LSCDERS) & + IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives + + IPIA_SZ = ALIGN(2*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + + ! In Legendre space, we then ignore vorticity/divergence, if + ! they don't need to be transformed. + IF_LEG = IF_READIN + IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed + IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed + + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + ! PIA + IALLOC_SZ = IPIA_SZ + ! ZINP + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZINP0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + + IALLOC_SZ = 0 + ! ZOUTA + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUTS + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + ! ZOUTA0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ! ZOUTS0 + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + + HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) + + IALLOC_SZ = D%NLENGT1B*2*IF_LEG*SIZEOF(ZPRBT_DUMMY) + HLTINV%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + END FUNCTION SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& @@ -51,8 +115,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& USE PRFI1B_MOD ,ONLY : PRFI1B USE VDTUV_MOD ,ONLY : VDTUV USE SPNSDE_MOD ,ONLY : SPNSDE - USE LEINV_MOD ,ONLY : LEINV - USE FSPGL_INT_MOD ,ONLY : FSPGL_INT + USE LEINV_MOD USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS use ieee_arithmetic USE TPM_FIELDS ,ONLY : F,ZEPSNM @@ -121,12 +184,12 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) - REAL(KIND=JPRB), ALLOCATABLE, INTENT(OUT) :: FOUBUF_IN(:) + REAL(KIND=JPRB), POINTER, INTENT(OUT) :: FOUBUF_IN(:) INTEGER(KIND=JPIM), INTENT(OUT) :: FOUBUF_KFIELD INTEGER(KIND=JPIM) :: IFIRST, J3 - REAL(KIND=JPRB), ALLOCATABLE, TARGET :: PIA(:,:,:) + REAL(KIND=JPRB), POINTER :: PIA_L(:), PIA(:,:,:) REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) @@ -134,6 +197,17 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:), ZINP(:) + REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:), ZINP0(:) + ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -141,6 +215,79 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IF (LHOOK) CALL DR_HOOK('LTINV_MOD',0,ZHOOK_HANDLE) + ! Get all pointers + IF_READIN = 0 + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! U + IF_READIN = IF_READIN + KF_UV ! V + IF_READIN = IF_READIN + KF_SCALARS ! Scalars + IF (LSCDERS) & + IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives + + !IPIA_SZ = ALIGN(2*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + + ! In Legendre space, we then ignore vorticity/divergence, if + ! they don't need to be transformed. + IF_LEG = IF_READIN + IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed + IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed + + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + IALLOC_POS = 1 + + ! PIA + IALLOC_SZ = ALIGN(2*IF_READIN*(R%NTMAX+3)*D%NUMP*SIZEOF(PIA_L(1)),128) + CALL ASSIGN_PTR(PIA_L, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(PIA_L), PIA, (/ 2*IF_READIN, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINP(1)),128) + CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP0 + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINP0(1)),128) + CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_POS = 1 + + ! ZOUTA + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTA(1)),128) + CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTS(1)),128) + CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTA0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTA0(1)),128) + CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTS0(1)),128) + CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! FOUBUF_IN + IALLOC_SZ = D%NLENGT1B*2*IF_LEG*SIZEOF(FOUBUF_IN(1)) + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HLTINV%HFOUBUF_IN),& + & 1_C_SIZE_T, IALLOC_SZ) + ! ------------------------------------------------------------------ @@ -174,10 +321,6 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IFIRST = IFIRST + 2*KF_SCALARS ! Scalars IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives - ! Allocate data accordingly - ALLOCATE(PIA(IFIRST,R%NSMAX+3,D%NUMP)) - !$ACC ENTER DATA CREATE(PIA) - ! And reiterate domain decomposition to assign pointers IFIRST = 0 IF (.NOT. LVORGP .OR. LDIVGP) THEN @@ -268,12 +411,9 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& ! Transform PIA into FOUBUF_IN IF (FOUBUF_KFIELD > 0) THEN - CALL LEINV(PIA(IFIRST:,:,:), FOUBUF_IN) + CALL LEINV(PIA(IFIRST:,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) ENDIF - !$ACC EXIT DATA DELETE(PIA) - DEALLOCATE(PIA) - IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index c9d1e71b8..ac6081272 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -14,6 +14,7 @@ MODULE TPM_TRANS ! USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT +USE ISO_C_BINDING, ONLY: C_INT8_T IMPLICIT NONE @@ -69,6 +70,6 @@ MODULE TPM_TRANS ! we adapt the size. After 2 iterations this should lead to constant runtimes ! (the first iteration is used to get the max buffer size, the second iteration ! is going to recreate the graphs if needed) -REAL(KIND=JPRBT),POINTER :: REUSE_PTR(:) +INTEGER(KIND=C_INT8_T),POINTER :: REUSE_PTR(:) END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 5ca7378f7..3d133647d 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 1995- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -360,8 +361,10 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& + & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) + ENDIF !....Pack loop......................................................... !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) @@ -441,8 +444,10 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF CALL GSTATS(411,0) - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& + & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + ENDIF !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) IR=0 diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index e9d900339..984184aab 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 1995- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -16,17 +17,28 @@ MODULE TRLTOG_MOD PUBLIC :: TRLTOG_CUDAAWARE, TRLTOG_HANDLE, PREPARE_TRLTOG TYPE TRLTOG_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS END TYPE CONTAINS - FUNCTION PREPARE_TRLTOG(ALLOCATOR) RESULT(HTRLTOG) + FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS TYPE(TRLTOG_HANDLE) :: HTRLTOG + + REAL(KIND=JPRBT) :: DUMMY + + INTEGER(KIND=C_SIZE_T) :: NELEM + + NELEM = ALIGN(KF_GP*D%NGPTOT*SIZEOF(DUMMY),128) ! ZCOMBUFR + NELEM = ALIGN(NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY),128) !ZCOMBUFS upper obund + + HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) END FUNCTION SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& &PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -122,7 +134,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ! LOCAL VARIABLES - REAL(KIND=JPRBT),ALLOCATABLE :: ZCOMBUFS(:),ZCOMBUFR(:) + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) @@ -469,10 +481,17 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) ENDDO - IF (ISEND_COUNTS > 0) ALLOCATE(ZCOMBUFS(ICOMBUFS_OFFSET(ISEND_COUNTS+1))) - !$ACC ENTER DATA IF(ISEND_COUNTS > 0) CREATE(ZCOMBUFS) ASYNC(1) + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + ENDIF + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1, & + & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) + ENDIF - !$ACC DATA PRESENT(ZCOMBUFS) ASYNC(1) + !$ACC DATA PRESENT(ZCOMBUFS) CALL GSTATS(1605,0) DO INS=1,ISEND_COUNTS IPROC = ISEND_TO_PROC(INS) @@ -493,8 +512,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC END DATA ! PREEL_REAL - IF (IRECV_COUNTS > 0) ALLOCATE(ZCOMBUFR(ICOMBUFR_OFFSET(IRECV_COUNTS+1))) - !$ACC DATA IF(IRECV_COUNTS > 0) CREATE(ZCOMBUFR) ASYNC(1) !$ACC WAIT(1) CALL GSTATS(805,0) @@ -540,9 +557,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ENDIF CALL GSTATS(421,1) - !$ACC EXIT DATA IF(ISEND_COUNTS > 0) DELETE(ZCOMBUFS) - IF (ISEND_COUNTS > 0) DEALLOCATE(ZCOMBUFS) - + !$ACC DATA PRESENT(ZCOMBUFR) CALL GSTATS(805,1) ! Unpack loop......................................................... @@ -628,7 +643,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES !$ACC WAIT(1) - IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) CALL GSTATS(1606,1) diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index d2f118aff..04acf8936 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -22,10 +22,10 @@ MODULE TRLTOM_PACK_UNPACK TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN END TYPE TYPE TRLTOM_UNPACK_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA END TYPE TYPE TRLTOM_DIRECT_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: ZINPS_AND_ZINPA + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA END TYPE CONTAINS FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) @@ -145,7 +145,7 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - HTRLTOM_UNPACK%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) END FUNCTION SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) @@ -184,22 +184,22 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP IALLOC_POS=1 IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) - CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) - CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) - CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) - CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ @@ -272,7 +272,7 @@ FUNCTION PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) RESULT(HTRLTOM_DIRECT) ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - HTRLTOM_DIRECT%ZINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + HTRLTOM_DIRECT%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) END FUNCTION SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) @@ -309,22 +309,22 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP IALLOC_POS=1 IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) - CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) - CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) - CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) - CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%ZINPS_AND_ZINPA),& + CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 874cd9a73..6ba362a7b 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -16,22 +16,25 @@ MODULE TRMTOL_MOD PUBLIC :: TRMTOL_CUDAAWARE, PREPARE_TRMTOL, TRMTOL_HANDLE TYPE TRMTOL_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF END TYPE CONTAINS - FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_FS) RESULT(HTRMTOL) + FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG TYPE(TRMTOL_HANDLE) :: HTRMTOL REAL(KIND=JPRBT) :: DUMMY + + HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, D%NLENGT0B*2*KF_LEG*SIZEOF(DUMMY)) END FUNCTION -SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) +SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) !**** *trmtol * - transposition in Fourier space @@ -50,7 +53,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. -! KFIELD - Number of fields communicated +! KF_LEG - Number of fields communicated ! Implicit arguments : ! -------------------- @@ -93,9 +96,9 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) IMPLICIT NONE -INTEGER(KIND=JPIM) ,INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(OUT), ALLOCATABLE :: PFBUF(:) -REAL(KIND=JPRBT), INTENT(INOUT), ALLOCATABLE :: PFBUF_IN(:) +INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_LEG +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF(:) +REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK @@ -113,15 +116,15 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) -ALLOCATE(PFBUF(D%NLENGT0B*KFIELD)) -!$ACC ENTER DATA CREATE(PFBUF) +CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& + & 1_C_SIZE_T, D%NLENGT0B*2*KF_LEG*SIZEOF(PFBUF(1))) IF(NPROC > 1) THEN DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*KFIELD - IOFFS(J) = D%NSTAGT1B(J)*KFIELD - ILENR(J) = D%NLTSGTB(J)*KFIELD - IOFFR(J) = D%NSTAGT0B(J)*KFIELD + ILENS(J) = D%NLTSFTB(J)*2*KF_LEG + IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG + ILENR(J) = D%NLTSGTB(J)*2*KF_LEG + IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG ENDDO CALL GSTATS(807,0) @@ -165,8 +168,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) !$ACC WAIT(1) CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*KFIELD - ISTA = D%NSTAGT0B(MYSETW)*KFIELD+1 + ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG + ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 CALL GSTATS(1608,0) !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) DO J=ISTA,ISTA+ILEN-1 @@ -175,11 +178,6 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KFIELD) CALL GSTATS(1608,1) ENDIF -IF (ALLOCATED(PFBUF_IN)) THEN - !$ACC EXIT DATA DELETE(PFBUF_IN) - DEALLOCATE(PFBUF_IN) -ENDIF - IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From 4e379beaf0e795e2c17564c75a6b49c03ce66ce1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:37 -0700 Subject: [PATCH 233/263] Split leinv into leinv and leinv_pack --- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 14 +-- src/trans/gpu/internal/leinv_mod.F90 | 120 ++++++++++++++----- src/trans/gpu/internal/ltinv_mod.F90 | 79 +++++------- 3 files changed, 127 insertions(+), 86 deletions(-) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index febb6bdaa..bad9f833a 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -133,8 +133,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! Local variables -INTEGER(KIND=JPIM) :: KFIELD - REAL(KIND=JPRB), POINTER :: FOUBUF(:) REAL(KIND=JPRB), POINTER :: FOUBUF_IN(:) @@ -219,7 +217,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF (KF_FS > 0) THEN CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & FOUBUF_IN,KFIELD) + & FOUBUF_IN) ENDIF CALL GSTATS(102,1) @@ -229,20 +227,18 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL GSTATS(152,1) CALL GSTATS(107,0) - - ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now - IF (KF_FS > 0) CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KFIELD/2,KF_FS) + IF (KF_FS > 0) CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) ! 2. Fourier space computations ! fill the rest of PREEL_COMPLEX - IF (KF_FS > 0) CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + IF (KF_FS > 0) CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) ! 3. Fourier transform ! inplace operation - IF (KF_FS > 0) CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,KF_FS) + IF (KF_FS > 0) CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) CALL GSTATS(107,1) @@ -324,7 +320,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ENDIF CALL GSTATS(157,0) - CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& + CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& &PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) CALL GSTATS(157,1) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index ec38ef3b0..4f018b165 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -14,7 +14,7 @@ MODULE LEINV_MOD IMPLICIT NONE PRIVATE - PUBLIC :: LEINV_STRIDES, LEINV + PUBLIC :: LEINV_STRIDES, LEINV, LEINV_PACK INTEGER(KIND=JPIM) :: A = 8 !Alignment @@ -52,7 +52,7 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STR IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) END SUBROUTINE - SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) + SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !**** *LEINV* - Inverse Legendre transform. @@ -72,7 +72,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) ! Implicit arguments : None. ! -------------------- - ! Method. use butterfly or dgemm + ! Method. ! ------- ! Externals. @@ -93,43 +93,30 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) ! ------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB, JPRBT, JPRD USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,MYPROC - USE TPM_GEN, ONLY: NOUT + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYPROC USE CUDA_GEMM_BATCHED_MOD - USE, INTRINSIC :: ISO_C_BINDING USE MPL_MODULE ,ONLY : MPL_BARRIER - USE IEEE_ARITHMETIC - USE OPENACC USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE - - ! DUMMY ARGUMENTS - INTEGER(KIND=JPIM) :: KM - INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM) :: KIFC - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPRB), INTENT(IN) :: PIA(:,:,:) - REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG REAL(KIND=JPRBT), INTENT(OUT) :: ZINP(:), ZOUTS(:), ZOUTA(:) REAL(KIND=JPRD), INTENT(OUT) :: ZINP0(:), ZOUTS0(:), ZOUTA0(:) ! LOCAL - REAL(KIND=JPRBT) :: ZAOA, ZSOA - - INTEGER(KIND=JPIM) :: IA, IS, ISL, J1, JGL, JK, J, IGLS, ISTAS, OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: KF_LEG + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -137,7 +124,6 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) - KF_LEG = SIZE(PIA,1)/2 ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -152,7 +138,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & - !$ACC& PRESENT(D_MYMS,D_NPNTGTB1,G_NDGLU) + !$ACC& PRESENT(D_MYMS,G_NDGLU) IF (KMLOC0 > 0) THEN print*,'computing m=0 in double precision' @@ -320,8 +306,86 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) ENDIF CALL GSTATS(424,1) - !$ACC DATA PRESENT(FOUBUF_IN) - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,ISTAS,ZAOA,ZSOA) ASYNC(1) + !$ACC WAIT(1) + + !$ACC END DATA + + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + + END SUBROUTINE LEINV + SUBROUTINE LEINV_PACK(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) + + !**** *TRMTOL_PACK* - Packing buffer for TRMTOL + + ! Purpose. + ! -------- + ! Packs data from LTINV outputs into FOUBUF for conversion to fourier space + + !** Interface. + ! ---------- + ! CALL TRMTOL_PACK(...) + + ! Explicit arguments : ZOUTS - symmetric data + ! -------------------- ZOUTA - asymmetric data + ! ZOUTS0 - symmetric data for KMLOC0 + ! ZOUTA0 - asymmetric data for KMLOC0 + ! FOUBUF_IN - output towards TRMTOL + ! KF_LEG - number of fields (we have 2XKF_LEG because complex) + + ! Implicit arguments : None. + ! -------------------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + ! + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD + USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK + USE TPM_DIM, ONLY : R, R_NDGNH,R_NDGL + USE TPM_GEOMETRY,ONLY : G,G_NDGLU + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1 + + IMPLICIT NONE + + + ! DUMMY ARGUMENTS + REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZOUTS0(:), ZOUTA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + ! LOCAL + REAL(KIND=JPRBT) :: ZAOA, ZSOA + + INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + IF (LHOOK) CALL DR_HOOK('LEINV_PACK',0,ZHOOK_HANDLE) + + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + + !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,G,G_NDGLU) & + !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) + + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) ASYNC(1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH DO JK=1,2*KF_LEG @@ -354,11 +418,9 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) !$ACC WAIT(1) - !$ACC END DATA !$ACC END DATA - IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ + IF (LHOOK) CALL DR_HOOK('LEINV_PACK',1,ZHOOK_HANDLE) - END SUBROUTINE LEINV + END SUBROUTINE LEINV_PACK END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 3ada1996b..bf99a6402 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -102,7 +102,7 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & FOUBUF_IN,FOUBUF_KFIELD) + & FOUBUF_IN) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK @@ -185,7 +185,6 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) REAL(KIND=JPRB), POINTER, INTENT(OUT) :: FOUBUF_IN(:) - INTEGER(KIND=JPIM), INTENT(OUT) :: FOUBUF_KFIELD INTEGER(KIND=JPIM) :: IFIRST, J3 @@ -288,40 +287,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HLTINV%HFOUBUF_IN),& & 1_C_SIZE_T, IALLOC_SZ) - ! ------------------------------------------------------------------ - - - !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. - ! ---------------------------------------------- - - IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(422,0) - !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) - !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) - !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) - !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) - !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) - IF (LSYNC_TRANS) THEN - CALL GSTATS(442,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(442,1) - ENDIF - CALL GSTATS(422,1) - - ! Compute PIA Domain decomposition - IFIRST = 0 - IFIRST = IFIRST + 2*KF_UV ! Vorticity or divergence - IFIRST = IFIRST + 2*KF_UV ! Vorticity or divergence - IFIRST = IFIRST + 2*KF_UV ! U - IFIRST = IFIRST + 2*KF_UV ! V - IFIRST = IFIRST + 2*KF_SCALARS ! Scalars - IF (LSCDERS) IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives - - ! And reiterate domain decomposition to assign pointers + ! Assign pointers do the different components of PIA IFIRST = 0 IF (.NOT. LVORGP .OR. LDIVGP) THEN ! Usually we want to store vorticity first @@ -350,6 +316,30 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives ENDIF + ! ------------------------------------------------------------------ + + + !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. + ! ---------------------------------------------- + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) + !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) + IF (LSYNC_TRANS) THEN + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) + IF (KF_UV > 0) THEN CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2)) CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2)) @@ -391,6 +381,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& !$ACC END DATA !$ACC END DATA + ! Compute NS derivatives if needed IF (LSCDERS) THEN CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) ENDIF @@ -401,18 +392,10 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& !* 4. INVERSE LEGENDRE TRANSFORM. ! --------------------------- - ! Forget about Vorticity and divergence if we don't need it in the output - IFIRST = 1 - IF(.NOT. LVORGP) IFIRST = IFIRST+2*KF_UV - IF(.NOT. LDIVGP) IFIRST = IFIRST+2*KF_UV - - ! Keep this for next functions because we have to remember this - FOUBUF_KFIELD = SIZE(PIA,1)-IFIRST+1 - - ! Transform PIA into FOUBUF_IN - IF (FOUBUF_KFIELD > 0) THEN - CALL LEINV(PIA(IFIRST:,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) - ENDIF + ! Legendre transforms. When converting PIA into ZOUT, we ignore the first entries of LEINV. + ! This is because vorticity and divergence is not necessarily converted to GP space. + CALL LEINV(PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) + CALL LEINV_PACK(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ From 8869d260c1727ac8b19a962065db8d6b8a92d238 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:37 -0700 Subject: [PATCH 234/263] Finish split leinv and leinv_pack --- src/trans/gpu/internal/fourier_in_mod.F90 | 122 -------- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 27 +- src/trans/gpu/internal/leinv_mod.F90 | 111 +------- src/trans/gpu/internal/ltinv_mod.F90 | 31 +- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 264 ++++++++++++++++++ 6 files changed, 292 insertions(+), 265 deletions(-) delete mode 100755 src/trans/gpu/internal/fourier_in_mod.F90 create mode 100755 src/trans/gpu/internal/trmtol_pack_unpack.F90 diff --git a/src/trans/gpu/internal/fourier_in_mod.F90 b/src/trans/gpu/internal/fourier_in_mod.F90 deleted file mode 100755 index 1f3b53979..000000000 --- a/src/trans/gpu/internal/fourier_in_mod.F90 +++ /dev/null @@ -1,122 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FOURIER_IN_MOD - USE ALLOCATOR_MOD - IMPLICIT NONE - - PRIVATE - PUBLIC :: FOURIER_IN, FOURIER_IN_HANDLE, PREPARE_FOURIER_IN - - TYPE FOURIER_IN_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL - END TYPE - -CONTAINS - FUNCTION PREPARE_FOURIER_IN(ALLOCATOR,KF_FS) RESULT(HFOURIER_IN) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: D - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM) :: KF_FS - - TYPE(FOURIER_IN_HANDLE) :: HFOURIER_IN - - REAL(KIND=JPRBT) :: DUMMY - - HFOURIER_IN%HREEL = RESERVE(ALLOCATOR, D%NLENGTF*KF_FS*SIZEOF(DUMMY)) - - END FUNCTION -SUBROUTINE FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) - -!**** *FOURIER_IN* - Copy fourier data from buffer to local array - -! Purpose. -! -------- -! Routine for copying fourier data from buffer to local array - -!** Interface. -! ---------- -! CALL FOURIER_IN(...) - -! Explicit arguments : PREEL_COMPLEX - local fourier/GP array -! -------------------- KF_CURRENT - number of fields that are read (from Legendre space) -! KF_TOTAL - total fields in PREEL ("stride") -! -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 2000-04-01 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT -USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS -USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN,G_NLOEN_MAX -! - -IMPLICIT NONE - -REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(FOURIER_IN_HANDLE), INTENT(IN) :: HFOURIER_IN - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL -REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX - -CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HFOURIER_IN%HREEL),& - & 1_C_SIZE_T, KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1))) - -!$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) - -OFFSET_VAR=D_NPTRLS(MYSETW) -!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & -!$ACC& ASYNC(1) TILE(32,16,1) -DO KGL=1,D%NDGL_FS - DO JF=1,KF_CURRENT - DO JM=0,G_NLOEN_MAX/2 - IGLG = OFFSET_VAR+KGL-1 - - ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have - ! to fill those floor(NLON/2)+1 values. - ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. - IF (JM <= G_NLOEN(IGLG)/2) THEN - RET_REAL = 0.0_JPRBT - RET_COMPLEX = 0.0_JPRBT - IF (JM <= G_NMEN(IGLG)) THEN - ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 - - RET_REAL = FOUBUF(ISTA+2*JF-1) - RET_COMPLEX = FOUBUF(ISTA+2*JF ) - ENDIF - IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) - PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL - PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX - ENDIF - ENDDO - ENDDO -ENDDO -!$ACC END DATA - -!$ACC WAIT(1) - -END SUBROUTINE FOURIER_IN -END MODULE FOURIER_IN_MOD - diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index f889055df..33fddfc94 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -74,7 +74,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(FTINV_HANDLE), INTENT(IN) :: HFTINV diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index bad9f833a..976444c43 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -84,7 +84,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD USE TPM_GEN ,ONLY : NPROMATR, NOUT, NERR USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, REUSE_PTR @@ -92,7 +92,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TRMTOL_MOD USE LTINV_MOD -USE FOURIER_IN_MOD +USE TRMTOL_PACK_UNPACK USE FSC_MOD USE FTINV_MOD USE TRLTOG_MOD @@ -133,11 +133,10 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! Local variables -REAL(KIND=JPRB), POINTER :: FOUBUF(:) -REAL(KIND=JPRB), POINTER :: FOUBUF_IN(:) - -REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) +REAL(KIND=JPRB), POINTER :: FOUBUF(:), FOUBUF_IN(:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) +REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:) +REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:) INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER @@ -151,8 +150,9 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR TYPE(LTINV_HANDLE) :: HLTINV +TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK TYPE(TRMTOL_HANDLE) :: HTRMTOL -TYPE(FOURIER_IN_HANDLE) :: HFOURIER_IN +TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK TYPE(FSC_HANDLE) :: HFSC TYPE(FTINV_HANDLE) :: HFTINV TYPE(TRLTOG_HANDLE) :: HTRLTOG @@ -202,8 +202,9 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() HLTINV = PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) + HTRMTOL_PACK = PREPARE_TRMTOL_PACK(ALLOCATOR,IF_LEG) HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG) - HFOURIER_IN = PREPARE_FOURIER_IN(ALLOCATOR,IF_FOURIER) + HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER) HFSC = PREPARE_FSC(ALLOCATOR) HFTINV = PREPARE_FTINV(ALLOCATOR) HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP) @@ -217,10 +218,12 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF (KF_FS > 0) THEN CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & FOUBUF_IN) - ENDIF + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) + END IF CALL GSTATS(102,1) + IF (KF_FS > 0) CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) + CALL GSTATS(152,0) WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' IF (KF_FS > 0) CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) @@ -228,7 +231,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL GSTATS(107,0) ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now - IF (KF_FS > 0) CALL FOURIER_IN(ALLOCATOR,HFOURIER_IN,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) + IF (KF_FS > 0) CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) ! 2. Fourier space computations diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 4f018b165..3672f11af 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -14,7 +14,7 @@ MODULE LEINV_MOD IMPLICIT NONE PRIVATE - PUBLIC :: LEINV_STRIDES, LEINV, LEINV_PACK + PUBLIC :: LEINV_STRIDES, LEINV INTEGER(KIND=JPIM) :: A = 8 !Alignment @@ -314,113 +314,4 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! ------------------------------------------------------------------ END SUBROUTINE LEINV - SUBROUTINE LEINV_PACK(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) - - !**** *TRMTOL_PACK* - Packing buffer for TRMTOL - - ! Purpose. - ! -------- - ! Packs data from LTINV outputs into FOUBUF for conversion to fourier space - - !** Interface. - ! ---------- - ! CALL TRMTOL_PACK(...) - - ! Explicit arguments : ZOUTS - symmetric data - ! -------------------- ZOUTA - asymmetric data - ! ZOUTS0 - symmetric data for KMLOC0 - ! ZOUTA0 - asymmetric data for KMLOC0 - ! FOUBUF_IN - output towards TRMTOL - ! KF_LEG - number of fields (we have 2XKF_LEG because complex) - - ! Implicit arguments : None. - ! -------------------- - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! Nils Wedi + Mats Hamrud + George Modzynski - ! - ! Modifications. - ! -------------- - ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: - ! F. Vana 05-Mar-2015 Support for single precision - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD - USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK - USE TPM_DIM, ONLY : R, R_NDGNH,R_NDGL - USE TPM_GEOMETRY,ONLY : G,G_NDGLU - USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1 - - IMPLICIT NONE - - - ! DUMMY ARGUMENTS - REAL(KIND=JPRB), INTENT(OUT) :: FOUBUF_IN(:) - REAL(KIND=JPRBT), INTENT(IN) :: ZOUTS(:), ZOUTA(:) - REAL(KIND=JPRD), INTENT(IN) :: ZOUTS0(:), ZOUTA0(:) - INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG - - ! LOCAL - REAL(KIND=JPRBT) :: ZAOA, ZSOA - - INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - - IF (LHOOK) CALL DR_HOOK('LEINV_PACK',0,ZHOOK_HANDLE) - - CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& - IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) - - !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,G,G_NDGLU) & - !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) - - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) ASYNC(1) - DO KMLOC=1,D_NUMP - DO JGL=1,R_NDGNH - DO JK=1,2*KF_LEG - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG - - IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ELSEIF (MOD((JK-1),2) .EQ. 0) THEN - ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) - ELSE - ! Imaginary values of KM=0 is zero, though I don't think we care - ZSOA = 0_JPRBT - ZAOA = 0_JPRBT - ENDIF - - FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA - FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA - ENDIF - ENDDO - ENDDO - ENDDO - - !$ACC WAIT(1) - - !$ACC END DATA - - IF (LHOOK) CALL DR_HOOK('LEINV_PACK',1,ZHOOK_HANDLE) - - END SUBROUTINE LEINV_PACK END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index bf99a6402..f5baee608 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -20,7 +20,6 @@ MODULE LTINV_MOD TYPE LTINV_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPIA_AND_IN TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUTS_AND_OUTA - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN END TYPE CONTAINS @@ -95,14 +94,11 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) - IALLOC_SZ = D%NLENGT1B*2*IF_LEG*SIZEOF(ZPRBT_DUMMY) - HLTINV%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ) - END FUNCTION SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & FOUBUF_IN) + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK @@ -184,7 +180,8 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) - REAL(KIND=JPRB), POINTER, INTENT(OUT) :: FOUBUF_IN(:) + REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), POINTER, INTENT(OUT) :: ZOUTS0(:), ZOUTA0(:) INTEGER(KIND=JPIM) :: IFIRST, J3 @@ -196,16 +193,16 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:), ZINP(:) - REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:), ZINP0(:) + REAL(KIND=JPRBT), POINTER :: ZINP(:) + REAL(KIND=JPRD), POINTER :: ZINP0(:) ! ------------------------------------------------------------------ @@ -282,11 +279,6 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ - ! FOUBUF_IN - IALLOC_SZ = D%NLENGT1B*2*IF_LEG*SIZEOF(FOUBUF_IN(1)) - CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HLTINV%HFOUBUF_IN),& - & 1_C_SIZE_T, IALLOC_SZ) - ! Assign pointers do the different components of PIA IFIRST = 0 IF (.NOT. LVORGP .OR. LDIVGP) THEN @@ -395,7 +387,6 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& ! Legendre transforms. When converting PIA into ZOUT, we ignore the first entries of LEINV. ! This is because vorticity and divergence is not necessarily converted to GP space. CALL LEINV(PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) - CALL LEINV_PACK(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 new file mode 100755 index 000000000..4e741a040 --- /dev/null +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -0,0 +1,264 @@ +! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TRMTOL_PACK_UNPACK + USE ALLOCATOR_MOD + IMPLICIT NONE + + PRIVATE + PUBLIC :: TRMTOL_PACK, TRMTOL_PACK_HANDLE, PREPARE_TRMTOL_PACK + PUBLIC :: TRMTOL_UNPACK, TRMTOL_UNPACK_HANDLE, PREPARE_TRMTOL_UNPACK + + TYPE TRMTOL_PACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN + END TYPE + TYPE TRMTOL_UNPACK_HANDLE + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL + END TYPE + +CONTAINS + FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE ISO_C_BINDING + USE LEINV_MOD + USE ALLOCATOR_MOD + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK + + INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + + REAL(KIND=JPRBT) :: ZPRBT_DUMMY + + IALLOC_SZ = D%NLENGT1B*2*KF_LEG*SIZEOF(ZPRBT_DUMMY) + HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + END FUNCTION + SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) + + !**** *TRMTOL_PACK* - Packing buffer for TRMTOL + + ! Purpose. + ! -------- + ! Packs data from LTINV outputs into FOUBUF for conversion to fourier space + + !** Interface. + ! ---------- + ! CALL TRMTOL_PACK(...) + + ! Explicit arguments : ZOUTS - symmetric data + ! -------------------- ZOUTA - asymmetric data + ! ZOUTS0 - symmetric data for KMLOC0 + ! ZOUTA0 - asymmetric data for KMLOC0 + ! FOUBUF_IN - output towards TRMTOL + ! KF_LEG - number of fields (we have 2XKF_LEG because complex) + + ! Implicit arguments : None. + ! -------------------- + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + ! + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD + USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK + USE TPM_DIM, ONLY : R, R_NDGNH,R_NDGL + USE TPM_GEOMETRY,ONLY : G,G_NDGLU + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1 + USE LEINV_MOD, ONLY: LEINV_STRIDES + + IMPLICIT NONE + + + ! DUMMY ARGUMENTS + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRMTOL_PACK_HANDLE), INTENT(IN) :: HTRMTOL_PACK + REAL(KIND=JPRB), INTENT(OUT), POINTER :: FOUBUF_IN(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZOUTS0(:), ZOUTA0(:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG + + ! LOCAL + REAL(KIND=JPRBT) :: ZAOA, ZSOA + + INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& + & 1_C_SIZE_T, D%NLENGT1B*2*KF_LEG*SIZEOF(FOUBUF_IN(1))) + + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + + !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,G,G_NDGLU) & + !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) + + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JGL=1,R_NDGNH + DO JK=1,2*KF_LEG + KM = D_MYMS(KMLOC) + ISL = R_NDGNH-G_NDGLU(KM)+1 + IF (JGL >= ISL) THEN + !(DO JGL=ISL,R_NDGNH) + IGLS = R_NDGL+1-JGL + OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG + OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG + + IF(KM /= 0) THEN + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ELSEIF (MOD((JK-1),2) .EQ. 0) THEN + ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) + ELSE + ! Imaginary values of KM=0 is zero, though I don't think we care + ZSOA = 0_JPRBT + ZAOA = 0_JPRBT + ENDIF + + FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA + FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA + ENDIF + ENDDO + ENDDO + ENDDO + + !$ACC WAIT(1) + + !$ACC END DATA + + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',1,ZHOOK_HANDLE) + + END SUBROUTINE TRMTOL_PACK + + FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: D + + IMPLICIT NONE + + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=JPIM) :: KF_FS + + TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK + + REAL(KIND=JPRBT) :: DUMMY + + HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, D%NLENGTF*KF_FS*SIZEOF(DUMMY)) + + END FUNCTION +SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) + +!**** *TRMTOL_UNPACK* - Copy fourier data from buffer to local array + +! Purpose. +! -------- +! Routine for copying fourier data from buffer to local array + +!** Interface. +! ---------- +! CALL TRMTOL_UNPACK(...) + +! Explicit arguments : PREEL_COMPLEX - local fourier/GP array +! -------------------- KF_CURRENT - number of fields that are read (from Legendre space) +! KF_TOTAL - total fields in PREEL ("stride") +! +! Externals. None. +! ---------- + +! Author. +! ------- +! Mats Hamrud *ECMWF* + +! Modifications. +! -------------- +! Original : 2000-04-01 + +! ------------------------------------------------------------------ + +USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT +USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS +USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NLOEN,G_NLOEN_MAX +! + +IMPLICIT NONE + +REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:) +REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) +INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL +TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR +TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK + +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX + +CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& + & 1_C_SIZE_T, KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1))) + +!$ACC DATA PRESENT(D,G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF) ASYNC(1) + +OFFSET_VAR=D_NPTRLS(MYSETW) +!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) DEFAULT(NONE) & +!$ACC& ASYNC(1) TILE(32,16,1) +DO KGL=1,D%NDGL_FS + DO JF=1,KF_CURRENT + DO JM=0,G_NLOEN_MAX/2 + IGLG = OFFSET_VAR+KGL-1 + + ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have + ! to fill those floor(NLON/2)+1 values. + ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. + IF (JM <= G_NLOEN(IGLG)/2) THEN + RET_REAL = 0.0_JPRBT + RET_COMPLEX = 0.0_JPRBT + IF (JM <= G_NMEN(IGLG)) THEN + ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 + + RET_REAL = FOUBUF(ISTA+2*JF-1) + RET_COMPLEX = FOUBUF(ISTA+2*JF ) + ENDIF + IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL + PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX + ENDIF + ENDDO + ENDDO +ENDDO +!$ACC END DATA + +!$ACC WAIT(1) + +END SUBROUTINE TRMTOL_UNPACK +END MODULE TRMTOL_PACK_UNPACK + From bd033778843f3386d3ec5fb1cd6f6e1241d18678 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:37 -0700 Subject: [PATCH 235/263] Move index computations into TRLTOG --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 1 - src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 129 +++---------------- src/trans/gpu/internal/trgtol_mod.F90 | 4 +- src/trans/gpu/internal/trltog_mod.F90 | 104 +++++++++++++-- 4 files changed, 119 insertions(+), 119 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index f389604cd..75a6a8dd4 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -167,7 +167,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ENDIF HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) - ! TODO this is going to be simplified when we have it implemented for invtrans too CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) ! from the PGP arrays to PREEL_REAL diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 976444c43..88e84292b 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -141,11 +141,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER -INTEGER(KIND=JPIM) :: IST -INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) -INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) -INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR INTEGER(KIND=JPIM) :: IFIRST TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR @@ -211,9 +206,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) - ! No splitting of fields, transform done in one go - ! from PSPXXX to FOUBUF - + ! Legendre transformations CALL GSTATS(102,0) IF (KF_FS > 0) THEN CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& @@ -222,109 +215,29 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& END IF CALL GSTATS(102,1) - IF (KF_FS > 0) CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) - - CALL GSTATS(152,0) - WRITE(NOUT,*) 'ltinv_ctl:TRMTOL_CUDAAWARE' - IF (KF_FS > 0) CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) - CALL GSTATS(152,1) - - CALL GSTATS(107,0) - ! from FOUBUF to PREEL_COMPLEX. Divide by two because we consider this complex space now - IF (KF_FS > 0) CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) - - ! 2. Fourier space computations - - ! fill the rest of PREEL_COMPLEX - IF (KF_FS > 0) CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & - & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) - - ! 3. Fourier transform - ! inplace operation - IF (KF_FS > 0) CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) - - CALL GSTATS(107,1) - - ! 4. Transposition - - IF (PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) - ELSE - IVSETUV(:) = -1 - ENDIF - IVSETSC(:)=-1 - IF (PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) - ELSE - IOFF=0 - IF (PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 - ENDIF - IF (PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - IGP3APAR=UBOUND(PGP3A,3) - IF (LSCDERS) IGP3APAR=IGP3APAR/3 - DO J3=1,IGP3APAR - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A - ENDDO - ENDIF - IF (PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - IGP3BPAR=UBOUND(PGP3B,3) - IF (LSCDERS) IGP3BPAR=IGP3BPAR/3 - DO J3=1,IGP3BPAR - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B - ENDDO - ENDIF - IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN - WRITE(NERR,*)'FTINV:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G - CALL ABORT_TRANS('FTINV_CTL_MOD:IOFF /= KF_SCALARS_G') - ENDIF - ENDIF - - IST = 1 - IF (KF_UV_G > 0) THEN - IF (LVORGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IF ( LDIVGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IF (KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - IF (LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF - ENDIF - IF (KF_UV_G > 0 .AND. LUVDER) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IF (KF_SCALARS_G > 0) THEN - IF (LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF + ! Packing into send buffer, to fourier space and unpack + IF (KF_FS > 0) THEN + CALL GSTATS(152,0) + CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) + CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) + CALL GSTATS(152,1) + + CALL GSTATS(107,0) + ! compute NS derivatives + CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + !Legendre transformations + CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) + CALL GSTATS(107,1) ENDIF + ! Transposition into grid-point space CALL GSTATS(157,0) - CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,IVSET,& - &PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& + & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) CALL GSTATS(157,1) ! ------------------------------------------------------------------ ! ------------------------------------------------------------------ diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 3d133647d..839727d34 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -215,9 +215,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ENDIF ENDIF - + IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN - PRINT*, "ERROR IN IVSET COMPUTATION" + PRINT*, "TRGTOL: ERROR IN IVSET COMPUTATION" FLUSH(6) STOP 38 ENDIF diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 984184aab..3fd5944fe 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -40,9 +40,10 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) END FUNCTION - SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) - + SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& + & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2) + !**** *trltog * - transposition of grid point data from latitudinal ! to column structure. This takes place between inverse ! FFT and grid point calculations. @@ -120,7 +121,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IMPLICIT NONE REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KVSET(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) @@ -129,6 +129,12 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRLTOG_HANDLE) :: HTRLTOG @@ -161,6 +167,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) + INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) + INTEGER(KIND=JPIM) :: J3,IFGP2,IFGP3A,IFGP3B + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR @@ -177,8 +188,83 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ! -------------------- IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - CALL GSTATS(1806,0) + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) + + ! We first get the decomposition individually + IVSETUV(:) = -1 + IF (PRESENT(KVSETUV)) IVSETUV(:) = KVSETUV(:) + IVSETSC(:)=-1 + IF (PRESENT(KVSETSC)) THEN + IVSETSC(:) = KVSETSC(:) + ELSE + IOFF=0 + IF (PRESENT(KVSETSC2)) THEN + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC2))=KVSETSC2(:) + IOFF = IOFF+SIZE(KVSETSC2) + ENDIF + IF (PRESENT(KVSETSC3A)) THEN + DO J3=1,MERGE(UBOUND(PGP3A,3),UBOUND(PGP3A,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF (PRESENT(KVSETSC3B)) THEN + ! If SCDERS is on, the size of PGP is 3X larger because it is + ! holding various derivatives. The problem is that those are + ! at different non-contiguous positions, hence we treat them + ! as separate fields + DO J3=1,MERGE(UBOUND(PGP3B,3),UBOUND(PGP3B,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN + PRINT*, "TRLTOG: ERROR IN IVSETSC COMPUTATION" + STOP 39 + ENDIF + ENDIF + + ! Now from UV and Scalars decomposition we get the full decomposition + IOFF=0 + IF (KF_UV_G > 0) THEN + IF (LVORGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF ( LDIVGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF (KF_SCALARS_G > 0) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF + ENDIF + IF (KF_UV_G > 0 .AND. LUVDER) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF (KF_SCALARS_G > 0) THEN + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF + ENDIF IF (.NOT. PRESENT(PGP)) THEN ! This is only relevant if we use the split interface (i.e. not PGP) @@ -309,6 +395,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ENDIF ENDIF + CALL GSTATS(1806,0) + ! Prepare receiver arrays ! find number of fields on a certain V-set IF(NPRTRV == 1) THEN @@ -317,7 +405,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ELSE IRECV_FIELD_COUNT(:) = 0 DO JFLD=1,KF_GP - IRECV_FIELD_COUNT(KVSET(JFLD)) = IRECV_FIELD_COUNT(KVSET(JFLD)) + 1 + IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1 ENDDO ENDIF ! find number of grid-points on a certain W-set that overlap with myself @@ -396,7 +484,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ! Input is KF_GP fields. We find the resulting KF_FS fields. IFLDS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == MYSETV .OR. KVSET(JFLD) == -1) THEN + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN IFLDS = IFLDS+1 IF(PRESENT(KPTRGP)) THEN IFLDA(IFLDS) = KPTRGP(JFLD) @@ -572,7 +660,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IFLDS = 0 DO JFLD=1,KF_GP - IF(KVSET(JFLD) == ISETV .OR. KVSET(JFLD) == -1 ) THEN + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN IFLDS = IFLDS+1 IF(PRESENT(KPTRGP)) THEN IFLDA(IFLDS)=KPTRGP(JFLD) From 4c3fa0ee9ec46aaaa65c6a4ea6091623aa7458b1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:38 -0700 Subject: [PATCH 236/263] Fix alignment in allocator and add implicit none --- src/trans/gpu/internal/allocator_mod.F90 | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index a31b5903d..48ceffa68 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -38,12 +38,14 @@ MODULE ALLOCATOR_MOD INTERFACE ASSIGN_PTR SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) USE ISO_C_BINDING + IMPLICIT NONE INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES END SUBROUTINE SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) USE ISO_C_BINDING + IMPLICIT NONE INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES @@ -58,6 +60,7 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) ! the bottom of the allocation. FUNCTION MAKE_BUFFERED_ALLOCATOR() + IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR) :: MAKE_BUFFERED_ALLOCATOR MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0 @@ -65,6 +68,7 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR() END FUNCTION FUNCTION RESERVE(ALLOCATOR, SZ) + IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ @@ -78,11 +82,12 @@ FUNCTION RESERVE(ALLOCATOR, SZ) END FUNCTION SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) + IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(C_INT8_T), OPTIONAL, INTENT(INOUT), POINTER :: OLD_PTR(:) + ALLOCATOR%BUFR_SZ(0) = ALIGN(ALLOCATOR%BUFR_SZ(0),128) ALLOCATOR%BUFR_SZ(1) = ALIGN(ALLOCATOR%BUFR_SZ(1),128) - ALLOCATOR%BUFR_SZ(2) = ALIGN(ALLOCATOR%BUFR_SZ(2),128) IF (ASSOCIATED(OLD_PTR)) THEN IF (SIZEOF(OLD_PTR) < SUM(ALLOCATOR%BUFR_SZ) ) THEN @@ -104,6 +109,7 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) END SUBROUTINE FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) + IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION @@ -123,6 +129,7 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) USE ISO_C_BINDING + IMPLICIT NONE INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES @@ -135,6 +142,7 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) END SUBROUTINE SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) USE ISO_C_BINDING + IMPLICIT NONE INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES From 3aa054c1dc4c65cb9fa779cbccb4f98aae2e9653 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:38 -0700 Subject: [PATCH 237/263] Reformat some files --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 381 +++--- src/trans/gpu/internal/ftdir_mod.F90 | 122 +- src/trans/gpu/internal/ftinv_mod.F90 | 123 +- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 463 ++++--- src/trans/gpu/internal/ledir_mod.F90 | 560 ++++---- src/trans/gpu/internal/leinv_mod.F90 | 2 - src/trans/gpu/internal/ltdir_mod.F90 | 438 +++--- src/trans/gpu/internal/ltinv_mod.F90 | 572 ++++---- src/trans/gpu/internal/trgtol_mod.F90 | 1021 +++++++------- src/trans/gpu/internal/trltog_mod.F90 | 1249 +++++++++--------- src/trans/gpu/internal/trltom_mod.F90 | 303 +++-- src/trans/gpu/internal/trmtol_mod.F90 | 296 +++-- 12 files changed, 2749 insertions(+), 2781 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 75a6a8dd4..46fec9b4c 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -10,183 +10,181 @@ MODULE DIR_TRANS_CTL_MOD CONTAINS -SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -!**** *DIR_TRANS_CTL* - Control routine for direct spectral transform. - -! Purpose. -! -------- -! Control routine for the direct spectral transform - -!** Interface. -! ---------- -! CALL DIR_TRANS_CTL(...) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! PSPVOR(:,:) - spectral vorticity -! PSPDIV(:,:) - spectral divergence -! PSPSCALAR(:,:) - spectral scalarvalued fields -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! PGP(:,:,:) - gridpoint fields - -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): -! -! u : KF_UV_G fields -! v : KF_UV_G fields -! scalar fields : KF_SCALARS_G fields - -! Method. -! ------- - -! Externals. SHUFFLE - reshuffle fields for load balancing -! ---------- FIELD_SPLIT - split fields in NPROMATR packets -! LTDIR_CTL - control of Legendre transform -! FTDIR_CTL - control of Fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 01-01-03 - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB -USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD - -USE TPM_GEN ,ONLY : NPROMATR, NOUT -USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B -USE TPM_DISTR, ONLY: NPROC -USE FTDIR_MOD ,ONLY : FTDIR, FTDIR_HANDLE, PREPARE_FTDIR - -USE SHUFFLE_MOD ,ONLY : SHUFFLE -USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR, LTDIR_HANDLE -USE TRGTOL_MOD -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM -USE TRLTOM_PACK_UNPACK - -USE TPM_DISTR, ONLY : D, NPROC -USE TPM_TRANS, ONLY:REUSE_PTR - -USE ALLOCATOR_MOD -USE ISO_C_BINDING, ONLY: C_INT8_T -! + SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *DIR_TRANS_CTL* - Control routine for direct spectral transform. + + ! Purpose. + ! -------- + ! Control routine for the direct spectral transform + + !** Interface. + ! ---------- + ! CALL DIR_TRANS_CTL(...) + + ! Explicit arguments : + ! -------------------- + ! KF_UV_G - global number of spectral u-v fields + ! KF_SCALARS_G - global number of scalar spectral fields + ! KF_GP - total number of output gridpoint fields + ! KF_FS - total number of fields in fourier space + ! KF_UV - local number of spectral u-v fields + ! KF_SCALARS - local number of scalar spectral fields + ! PSPVOR(:,:) - spectral vorticity + ! PSPDIV(:,:) - spectral divergence + ! PSPSCALAR(:,:) - spectral scalarvalued fields + ! KVSETUV(:) - indicating which 'b-set' in spectral space owns a + ! vor/div field. Equivalant to NBSETLEV in the IFS. + ! The length of KVSETUV should be the GLOBAL number + ! of u/v fields which is the dimension of u and v releated + ! fields in grid-point space. + ! KVESETSC(:) - indicating which 'b-set' in spectral space owns a + ! scalar field. As for KVSETUV this argument is required + ! if the total number of processors is greater than + ! the number of processors used for distribution in + ! spectral wave space. + ! PGP(:,:,:) - gridpoint fields + + ! The ordering of the output fields is as follows (all + ! parts are optional depending on the input switches): + ! + ! u : KF_UV_G fields + ! v : KF_UV_G fields + ! scalar fields : KF_SCALARS_G fields + + ! Method. + ! ------- + + ! Externals. SHUFFLE - reshuffle fields for load balancing + ! ---------- FIELD_SPLIT - split fields in NPROMATR packets + ! LTDIR_CTL - control of Legendre transform + ! FTDIR_CTL - control of Fourier transform + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 01-01-03 + + ! ------------------------------------------------------------------ + + USE PARKIND1 ,ONLY : JPIM ,JPRB + USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD + + USE TPM_GEN ,ONLY : NPROMATR, NOUT + USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B + USE TPM_DISTR, ONLY: NPROC + USE FTDIR_MOD ,ONLY : FTDIR, FTDIR_HANDLE, PREPARE_FTDIR + + USE SHUFFLE_MOD ,ONLY : SHUFFLE + USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT + USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR, LTDIR_HANDLE + USE TRGTOL_MOD + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM + USE TRLTOM_PACK_UNPACK + + USE TPM_DISTR, ONLY : D, NPROC + USE TPM_TRANS, ONLY:REUSE_PTR + + USE ALLOCATOR_MOD + USE ISO_C_BINDING, ONLY: C_INT8_T + ! + + IMPLICIT NONE + + ! Declaration of arguments + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP2(:,:,:) + + ! Local variables + + INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) + INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) + INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) + INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G + INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP + INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB + + REAL(KIND=JPRBT), POINTER :: FOUBUF_IN(:), FOUBUF(:) + REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) + + REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) + + TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR + TYPE(TRGTOL_HANDLE) :: HTRGTOL + TYPE(FTDIR_HANDLE) :: HFTDIR + TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK + TYPE(TRLTOM_HANDLE) :: HTRLTOM + TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK + TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT + TYPE(LTDIR_HANDLE) :: HLTDIR + + IF(NPROMATR > 0) THEN + PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" + STOP 4 + ENDIF + + ! Prepare everything + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) + HFTDIR = PREPARE_FTDIR() + IF (NPROC > 1) THEN + HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) + HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) + HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) + ELSE + HTRLTOM_DIRECT = PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) + ENDIF + HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) + + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) -IMPLICIT NONE - -! Declaration of arguments - -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP2(:,:,:) - -! Local variables - -INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) -INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) -INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) -INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G -INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB - -REAL(KIND=JPRBT), POINTER :: FOUBUF_IN(:) -REAL(KIND=JPRBT), POINTER :: FOUBUF(:) -REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - -REAL(KIND=JPRBT), POINTER :: ZINPS(:), ZINPA(:) -REAL(KIND=JPRD), POINTER :: ZINPS0(:), ZINPA0(:) - -TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR -TYPE(TRGTOL_HANDLE) :: HTRGTOL -TYPE(FTDIR_HANDLE) :: HFTDIR -TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK -TYPE(TRLTOM_HANDLE) :: HTRLTOM -TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK -TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT -TYPE(LTDIR_HANDLE) :: HLTDIR - - IF(NPROMATR > 0) THEN - PRINT *, "ERROR, not implemented right now (NPROMATR > 0)" - STOP 4 - ENDIF - - ! Prepare everything - ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() - HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) - HFTDIR = PREPARE_FTDIR() - IF (NPROC > 1) THEN - HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) - HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) - HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) - ELSE - HTRLTOM_DIRECT = PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) - ENDIF - HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) - - CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) - - ! from the PGP arrays to PREEL_REAL - CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& - & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - - ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) - CALL GSTATS(1640,0) - IF (KF_FS > 0) THEN - CALL FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) - ELSE - PREEL_COMPLEX => PREEL_REAL - ENDIF - CALL GSTATS(1640,1) - - CALL GSTATS(153,0) - IF (NPROC > 1) THEN - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' + ! from the PGP arrays to PREEL_REAL + CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& + & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + + ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) + CALL GSTATS(1640,0) + IF (KF_FS > 0) THEN + CALL FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) + ELSE + PREEL_COMPLEX => PREEL_REAL + ENDIF + CALL GSTATS(1640,1) + + CALL GSTATS(153,0) + IF (NPROC > 1) THEN + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' IF (KF_FS > 0) THEN CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -194,20 +192,19 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) ENDIF - ELSE - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' - ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space - CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - ENDIF - CALL GSTATS(153,1) - - IF (KF_FS > 0) THEN - CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2) - ENDIF -! ------------------------------------------------------------------ - -END SUBROUTINE DIR_TRANS_CTL + ELSE + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' + ! Short cut - no need to go through tansforms, we will go directly into + ! the legendre space + CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + ENDIF + CALL GSTATS(153,1) + + IF (KF_FS > 0) THEN + CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2) + ENDIF + + END SUBROUTINE DIR_TRANS_CTL END MODULE DIR_TRANS_CTL_MOD diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 2cbac1257..bcce3e749 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -20,86 +20,84 @@ FUNCTION PREPARE_FTDIR() RESULT(HFTDIR) TYPE(FTDIR_HANDLE) :: HFTDIR END FUNCTION -SUBROUTINE FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) + SUBROUTINE FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) + !**** *FTDIR - Direct Fourier transform -!**** *FTDIR - Direct Fourier transform + ! Purpose. Routine for Grid-point to Fourier transform + ! -------- -! Purpose. Routine for Grid-point to Fourier transform -! -------- + !** Interface. + ! ---------- + ! CALL FTDIR(..) -!** Interface. -! ---------- -! CALL FTDIR(..) + ! Explicit arguments : PREEL - Fourier/grid-point array + ! -------------------- KFIELD - number of fields -! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELD - number of fields + ! Method. + ! ------- -! Method. -! ------- + ! Externals. FFT992 - FFT routine + ! ---------- + ! -! Externals. FFT992 - FFT routine -! ---------- -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 -! G. Radnoti 01-04-24 2D model (NLOEN=1) -! D. Degrauwe (Feb 2012): Alternative extension zone (E') -! G. Mozdzynski (Oct 2014): support for FFTW transforms -! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW -! ------------------------------------------------------------------ + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + ! Modifications. + ! -------------- + ! Original : 00-03-03 + ! G. Radnoti 01-04-24 2D model (NLOEN=1) + ! D. Degrauwe (Feb 2012): Alternative extension zone (E') + ! G. Mozdzynski (Oct 2014): support for FFTW transforms + ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW + ! ------------------------------------------------------------------ -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : EXECUTE_DIR_FFT -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT -IMPLICIT NONE + USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC + USE TPM_GEOMETRY ,ONLY : G + USE TPM_FFTC ,ONLY : EXECUTE_DIR_FFT + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -TYPE(FTDIR_HANDLE) :: HFTDIR + IMPLICIT NONE -INTEGER(KIND=JPIM) :: IGLG,KGL + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) + TYPE(FTDIR_HANDLE) :: HFTDIR -! ------------------------------------------------------------------ + INTEGER(KIND=JPIM) :: KGL -PREEL_COMPLEX => PREEL_REAL + ! ------------------------------------------------------------------ -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) + PREEL_COMPLEX => PREEL_REAL -IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) -ENDIF -CALL GSTATS(413,0) -CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & - & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & - & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) + !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) -IF (LSYNC_TRANS) THEN - CALL GSTATS(433,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(433,1) -ENDIF -CALL GSTATS(413,1) + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(413,0) + CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & + & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & + & OFFSETS=D%NSTAGTF(1:D%NDGL_FS+1)) -!$ACC END DATA + IF (LSYNC_TRANS) THEN + CALL GSTATS(433,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(433,1) + ENDIF + CALL GSTATS(413,1) -NULLIFY(PREEL_REAL) + !$ACC END DATA -! ------------------------------------------------------------------ + NULLIFY(PREEL_REAL) -END SUBROUTINE FTDIR + ! ------------------------------------------------------------------ + END SUBROUTINE FTDIR END MODULE FTDIR_MOD diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 33fddfc94..757bc6957 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -27,87 +27,86 @@ FUNCTION PREPARE_FTINV(ALLOCATOR) RESULT(HFTINV) TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR TYPE(FTINV_HANDLE) :: HFTINV END FUNCTION -SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) -!**** *FTINV - Inverse Fourier transform + SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) + !**** *FTINV - Inverse Fourier transform -! Purpose. Routine for Fourier to Grid-point transform -! -------- + ! Purpose. Routine for Fourier to Grid-point transform + ! -------- -!** Interface. -! ---------- -! CALL FTINV(..) + !** Interface. + ! ---------- + ! CALL FTINV(..) -! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELD - number of fields + ! Explicit arguments : PREEL - Fourier/grid-point array + ! -------------------- KFIELD - number of fields -! Method. -! ------- + ! Method. + ! ------- -! Externals. FFT992 - FFT routine -! ---------- -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* + ! Externals. FFT992 - FFT routine + ! ---------- + ! -! Modifications. -! -------------- -! Original : 00-03-03 -! G. Radnoti 01-04-24 2D model (NLOEN=1) -! D. Degrauwe (Feb 2012): Alternative extension zone (E') -! G. Mozdzynski (Oct 2014): support for FFTW transforms -! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW -! ------------------------------------------------------------------ + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT + ! Modifications. + ! -------------- + ! Original : 00-03-03 + ! G. Radnoti 01-04-24 2D model (NLOEN=1) + ! D. Degrauwe (Feb 2012): Alternative extension zone (E') + ! G. Mozdzynski (Oct 2014): support for FFTW transforms + ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW + ! ------------------------------------------------------------------ -USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFTC ,ONLY : EXECUTE_INV_FFT -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT -IMPLICIT NONE + USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC + USE TPM_GEOMETRY ,ONLY : G + USE TPM_FFTC ,ONLY : EXECUTE_INV_FFT + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD -REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(FTINV_HANDLE), INTENT(IN) :: HFTINV + IMPLICIT NONE -INTEGER(KIND=JPIM) :: IGLG,KGL,IRET + INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(FTINV_HANDLE), INTENT(IN) :: HFTINV -! ------------------------------------------------------------------ + INTEGER(KIND=JPIM) :: KGL,IRET -PREEL_REAL => PREEL_COMPLEX + ! ------------------------------------------------------------------ -!$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) + PREEL_REAL => PREEL_COMPLEX -IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) -ENDIF -CALL GSTATS(423,0) -CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & - & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & - & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) + !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX) -IF (LSYNC_TRANS) THEN - CALL GSTATS(443,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(443,1) -ENDIF -CALL GSTATS(423,1) + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(423,0) + CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & + & LOENS=G%NLOEN(D%NPTRLS(MYSETW):D%NPTRLS(MYSETW)+D%NDGL_FS-1), & + & OFFSETS=D%NSTAGTF(1:D%NDGL_FS)) -!$ACC END DATA + IF (LSYNC_TRANS) THEN + CALL GSTATS(443,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(443,1) + ENDIF + CALL GSTATS(423,1) -NULLIFY(PREEL_COMPLEX) + !$ACC END DATA -! ------------------------------------------------------------------ + NULLIFY(PREEL_COMPLEX) -END SUBROUTINE FTINV + ! ------------------------------------------------------------------ + END SUBROUTINE FTINV END MODULE FTINV_MOD diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 88e84292b..e46b24e36 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -10,237 +10,234 @@ MODULE INV_TRANS_CTL_MOD CONTAINS -SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& - & KF_UV,KF_SCALARS,KF_SCDERS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,FSPGL_PROC,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -!**** *INV_TRANS_CTL* - Control routine for inverse spectral transform. - -! Purpose. -! -------- -! Control routine for the inverse spectral transform - -!** Interface. -! ---------- -! CALL INV_TRANS_CTL(...) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_OUT_LT - total number of fields coming out from inverse LT -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! KF_SCDERS - local number of derivatives of scalar spectral fields -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PSPSCALAR(:,:) - spectral scalarvalued fields (input) -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! FSPGL_PROC - external procedure to be executed in fourier space -! before transposition -! PGP(:,:,:) - gridpoint fields (output) - -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): - -! vorticity : KF_UV_G fields -! divergence : KF_UV_G fields -! u : KF_UV_G fields -! v : KF_UV_G fields -! scalar fields : KF_SCALARS_G fields -! N-S derivative of scalar fields : KF_SCALARS_G fields -! E-W derivative of u : KF_UV_G fields -! E-W derivative of v : KF_UV_G fields -! E-W derivative of scalar fields : KF_SCALARS_G fields - -! Method. -! ------- - -! Externals. SHUFFLE - reshuffle fields for load balancing -! ---------- FIELD_SPLIT - split fields in NPROMATR packets -! LTINV_CTL - control of Legendre transform -! FTINV_CTL - control of Fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 01-01-03 - -! ------------------------------------------------------------------ - - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD - -USE TPM_GEN ,ONLY : NPROMATR, NOUT, NERR -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, REUSE_PTR -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - -USE TRMTOL_MOD -USE LTINV_MOD -USE TRMTOL_PACK_UNPACK -USE FSC_MOD -USE FTINV_MOD -USE TRLTOG_MOD -USE TPM_DISTR ,ONLY : D -USE ALLOCATOR_MOD -! - -IMPLICIT NONE - -! Declaration of arguments - -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -REAL(KIND=JPRB) ,OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) -EXTERNAL FSPGL_PROC -OPTIONAL FSPGL_PROC -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) - -! Local variables - -REAL(KIND=JPRB), POINTER :: FOUBUF(:), FOUBUF_IN(:) -REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) -REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:) -REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:) -INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & - & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET -INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER - -INTEGER(KIND=JPIM) :: IFIRST - -TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR -TYPE(LTINV_HANDLE) :: HLTINV -TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK -TYPE(TRMTOL_HANDLE) :: HTRMTOL -TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK -TYPE(FSC_HANDLE) :: HFSC -TYPE(FTINV_HANDLE) :: HFTINV -TYPE(TRLTOG_HANDLE) :: HTRLTOG - -INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) - -! ------------------------------------------------------------------ - - IF(NPROMATR > 0) THEN - print *, "This is currently not supported and/or tested (NPROMATR > 0j" - stop 24 - ENDIF - - ! Compute Vertical domain decomposition - - ! Initialize potentially unset offsets - KSCALARS_NSDER_OFFSET = -1 - KUV_EWDER_OFFSET = -1 - KSCALARS_EWDER_OFFSET = -1 - - ! (note in ltinv we will initially start with a slightly different domain decomposition - ! which always has vorticity and divergence because this is the actual input) - IFIRST = 0 - IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity - IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence - KUV_OFFSET = IFIRST - IFIRST = IFIRST + KF_UV ! U - IFIRST = IFIRST + KF_UV ! V - KSCALARS_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars - IF (LSCDERS) THEN - KSCALARS_NSDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives - ENDIF - ! the rest of fields is being computed in fourier space, namely in FSC - IF_LEG = IFIRST - IF (LUVDER) THEN - KUV_EWDER_OFFSET = IFIRST - IFIRST = IFIRST+2*KF_UV ! U and V derivatives - ENDIF - IF (LSCDERS) THEN - KSCALARS_EWDER_OFFSET = IFIRST - IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives - ENDIF - IF_FOURIER = IFIRST - IF (IF_FOURIER /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') - - ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() - HLTINV = PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) - HTRMTOL_PACK = PREPARE_TRMTOL_PACK(ALLOCATOR,IF_LEG) - HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG) - HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER) - HFSC = PREPARE_FSC(ALLOCATOR) - HFTINV = PREPARE_FTINV(ALLOCATOR) - HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP) - - CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) - - ! Legendre transformations - CALL GSTATS(102,0) - IF (KF_FS > 0) THEN - CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) - END IF - CALL GSTATS(102,1) - - ! Packing into send buffer, to fourier space and unpack - IF (KF_FS > 0) THEN - CALL GSTATS(152,0) - CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) - CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) - CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) - CALL GSTATS(152,1) - - CALL GSTATS(107,0) - ! compute NS derivatives - CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & - & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) - !Legendre transformations - CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) - CALL GSTATS(107,1) - ENDIF - - ! Transposition into grid-point space - CALL GSTATS(157,0) - CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& - & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - CALL GSTATS(157,1) - ! ------------------------------------------------------------------ -! ------------------------------------------------------------------ - -END SUBROUTINE INV_TRANS_CTL + SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& + & KF_UV,KF_SCALARS,KF_SCDERS,& + & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,FSPGL_PROC,& + & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *INV_TRANS_CTL* - Control routine for inverse spectral transform. + + ! Purpose. + ! -------- + ! Control routine for the inverse spectral transform + + !** Interface. + ! ---------- + ! CALL INV_TRANS_CTL(...) + + ! Explicit arguments : + ! -------------------- + ! KF_UV_G - global number of spectral u-v fields + ! KF_SCALARS_G - global number of scalar spectral fields + ! KF_GP - total number of output gridpoint fields + ! KF_FS - total number of fields in fourier space + ! KF_OUT_LT - total number of fields coming out from inverse LT + ! KF_UV - local number of spectral u-v fields + ! KF_SCALARS - local number of scalar spectral fields + ! KF_SCDERS - local number of derivatives of scalar spectral fields + ! PSPVOR(:,:) - spectral vorticity (input) + ! PSPDIV(:,:) - spectral divergence (input) + ! PSPSCALAR(:,:) - spectral scalarvalued fields (input) + ! KVSETUV(:) - indicating which 'b-set' in spectral space owns a + ! vor/div field. Equivalant to NBSETLEV in the IFS. + ! The length of KVSETUV should be the GLOBAL number + ! of u/v fields which is the dimension of u and v releated + ! fields in grid-point space. + ! KVESETSC(:) - indicating which 'b-set' in spectral space owns a + ! scalar field. As for KVSETUV this argument is required + ! if the total number of processors is greater than + ! the number of processors used for distribution in + ! spectral wave space. + ! FSPGL_PROC - external procedure to be executed in fourier space + ! before transposition + ! PGP(:,:,:) - gridpoint fields (output) + + ! The ordering of the output fields is as follows (all + ! parts are optional depending on the input switches): + + ! vorticity : KF_UV_G fields + ! divergence : KF_UV_G fields + ! u : KF_UV_G fields + ! v : KF_UV_G fields + ! scalar fields : KF_SCALARS_G fields + ! N-S derivative of scalar fields : KF_SCALARS_G fields + ! E-W derivative of u : KF_UV_G fields + ! E-W derivative of v : KF_UV_G fields + ! E-W derivative of scalar fields : KF_SCALARS_G fields + + ! Method. + ! ------- + + ! Externals. SHUFFLE - reshuffle fields for load balancing + ! ---------- FIELD_SPLIT - split fields in NPROMATR packets + ! LTINV_CTL - control of Legendre transform + ! FTINV_CTL - control of Fourier transform + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 01-01-03 + + ! ------------------------------------------------------------------ + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD + + USE TPM_GEN ,ONLY : NPROMATR, NOUT, NERR + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, REUSE_PTR + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + + USE TRMTOL_MOD + USE LTINV_MOD + USE TRMTOL_PACK_UNPACK + USE FSC_MOD + USE FTINV_MOD + USE TRLTOG_MOD + USE TPM_DISTR ,ONLY : D + USE ALLOCATOR_MOD + ! + + IMPLICIT NONE + + ! Declaration of arguments + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G + INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC3B(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PSPSC2(:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + REAL(KIND=JPRB) ,OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) + EXTERNAL FSPGL_PROC + OPTIONAL FSPGL_PROC + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) + + ! Local variables + + REAL(KIND=JPRB), POINTER :: FOUBUF(:), FOUBUF_IN(:) + REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:), PREEL_COMPLEX(:) + REAL(KIND=JPRBT), POINTER :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), POINTER :: ZOUTS0(:), ZOUTA0(:) + INTEGER(KIND=JPIM) :: KUV_OFFSET, KSCALARS_OFFSET, KSCALARS_NSDER_OFFSET, & + & KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET + INTEGER(KIND=JPIM) :: IF_LEG, IF_FOURIER + + INTEGER(KIND=JPIM) :: IFIRST + + TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR + TYPE(LTINV_HANDLE) :: HLTINV + TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK + TYPE(TRMTOL_HANDLE) :: HTRMTOL + TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK + TYPE(FSC_HANDLE) :: HFSC + TYPE(FTINV_HANDLE) :: HFTINV + TYPE(TRLTOG_HANDLE) :: HTRLTOG + + INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) + + ! ------------------------------------------------------------------ + + IF(NPROMATR > 0) THEN + print *, "This is currently not supported and/or tested (NPROMATR > 0j" + stop 24 + ENDIF + + ! Compute Vertical domain decomposition + + ! Initialize potentially unset offsets + KSCALARS_NSDER_OFFSET = -1 + KUV_EWDER_OFFSET = -1 + KSCALARS_EWDER_OFFSET = -1 + + ! (note in ltinv we will initially start with a slightly different domain decomposition + ! which always has vorticity and divergence because this is the actual input) + IFIRST = 0 + IF (LVORGP) IFIRST = IFIRST + KF_UV ! Vorticity + IF (LDIVGP) IFIRST = IFIRST + KF_UV ! Divergence + KUV_OFFSET = IFIRST + IFIRST = IFIRST + KF_UV ! U + IFIRST = IFIRST + KF_UV ! V + KSCALARS_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars + IF (LSCDERS) THEN + KSCALARS_NSDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars NS Derivatives + ENDIF + ! the rest of fields is being computed in fourier space, namely in FSC + IF_LEG = IFIRST + IF (LUVDER) THEN + KUV_EWDER_OFFSET = IFIRST + IFIRST = IFIRST+2*KF_UV ! U and V derivatives + ENDIF + IF (LSCDERS) THEN + KSCALARS_EWDER_OFFSET = IFIRST + IFIRST = IFIRST + KF_SCALARS ! Scalars EW Derivatives + ENDIF + IF_FOURIER = IFIRST + IF (IF_FOURIER /= KF_FS) CALL ABORT_TRANS('Size mismatch: Wrong computation KF_FS') + + ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() + HLTINV = PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) + HTRMTOL_PACK = PREPARE_TRMTOL_PACK(ALLOCATOR,IF_LEG) + HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG) + HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER) + HFSC = PREPARE_FSC(ALLOCATOR) + HFTINV = PREPARE_FTINV(ALLOCATOR) + HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP) + + CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) + + ! Legendre transformations + CALL GSTATS(102,0) + IF (KF_FS > 0) THEN + CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) + END IF + CALL GSTATS(102,1) + + ! Packing into send buffer, to fourier space and unpack + IF (KF_FS > 0) THEN + CALL GSTATS(152,0) + CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) + CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) + CALL GSTATS(152,1) + + CALL GSTATS(107,0) + ! compute NS derivatives + CALL FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, IF_FOURIER, KF_UV, KF_SCALARS, KUV_OFFSET, KSCALARS_OFFSET, & + & KSCALARS_NSDER_OFFSET, KUV_EWDER_OFFSET, KSCALARS_EWDER_OFFSET) + !Legendre transformations + CALL FTINV(ALLOCATOR, HFTINV, PREEL_COMPLEX,PREEL_REAL,IF_FOURIER) + CALL GSTATS(107,1) + ENDIF + + ! Transposition into grid-point space + CALL GSTATS(157,0) + CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& + & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& + & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& + & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) + CALL GSTATS(157,1) + END SUBROUTINE INV_TRANS_CTL END MODULE INV_TRANS_CTL_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index ece8184df..d834c691c 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -18,293 +18,289 @@ MODULE LEDIR_MOD INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS -SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R - - IMPLICIT NONE + SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE TPM_DIM ,ONLY : R + + IMPLICIT NONE + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 + + IF (PRESENT(IOUT_STRIDES0)) & + IOUT_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IOUT_STRIDES1)) & + IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN_STRIDES0)) & + IIN_STRIDES0 = ALIGN(2*KF_FS,A) + IF (PRESENT(IIN_STRIDES1)) & + IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IOUT0_STRIDES0)) & + IOUT0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IOUT0_STRIDES1)) & + IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN0_STRIDES0)) & + IIN0_STRIDES0 = ALIGN(KF_FS,A) + IF (PRESENT(IIN0_STRIDES1)) & + IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + END SUBROUTINE + + SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) + !**** *LEDIR* - Direct Legendre transform. + + ! Purpose. + ! -------- + ! Direct Legendre tranform of state variables. + + !** Interface. + ! ---------- + ! CALL LEDIR(...) + + ! Explicit arguments : KM - zonal wavenumber + ! -------------------- KFC - number of field to transform + ! fields for zonal wavenumber KM + ! PSIA - symmetric part of Fourier + ! fields for zonal wavenumber KM + ! POA1 - spectral + ! fields for zonal wavenumber KM + + ! Implicit arguments : None. + ! -------------------- + + ! Method. + ! ------- use butterfly or dgemm + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Nils Wedi + Mats Hamrud + George Modzynski + + ! Modifications. + ! -------------- + ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: + ! F. Vana 05-Mar-2015 Support for single precision + ! ------------------------------------------------------------------ + + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL + USE TPM_GEOMETRY ,ONLY : G, G_NDGLU + USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS + USE CUDA_GEMM_BATCHED_MOD + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE, INTRINSIC :: ISO_C_BINDING + USE IEEE_ARITHMETIC + USE OPENACC + + + IMPLICIT NONE + + ! DUMMY ARGUMENTS + REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) + REAL(KIND=JPRBT), INTENT(INOUT) :: ZOUT(:) + REAL(KIND=JPRD), INTENT(INOUT) :: ZOUT0(:) + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: POA1(:,:,:) + INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS + + ! LOCAL VARIABLES + INTEGER(KIND=JPIM) :: KM + INTEGER(KIND=JPIM) :: KMLOC + INTEGER(KIND=JPIM) :: IA, IS, ISL, J + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 + + INTEGER(KIND=JPIM) :: IGLS, JF, JGL + INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) + + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + !$ACC DATA & + !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & + !$ACC& PRESENT(F,F%RW) & + !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & + !$ACC& PRESENT(ZAA,ZAS,POA1) + + ! anti-symmetric + IF(KMLOC0 > 0) THEN + PRINT*,'computing m=0 in double precision' + ENDIF - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 - - IF (PRESENT(IOUT_STRIDES0)) & - IOUT_STRIDES0 = ALIGN(2*KF_FS,A) - IF (PRESENT(IOUT_STRIDES1)) & - IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) - IF (PRESENT(IIN_STRIDES0)) & - IIN_STRIDES0 = ALIGN(2*KF_FS,A) - IF (PRESENT(IIN_STRIDES1)) & - IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) - IF (PRESENT(IOUT0_STRIDES0)) & - IOUT0_STRIDES0 = ALIGN(KF_FS,A) - IF (PRESENT(IOUT0_STRIDES1)) & - IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) - IF (PRESENT(IIN0_STRIDES0)) & - IIN0_STRIDES0 = ALIGN(KF_FS,A) - IF (PRESENT(IIN0_STRIDES1)) & - IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) -END SUBROUTINE - -SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) - -!**** *LEDIR* - Direct Legendre transform. - -! Purpose. -! -------- -! Direct Legendre tranform of state variables. - -!** Interface. -! ---------- -! CALL LEDIR(...) - -! Explicit arguments : KM - zonal wavenumber -! -------------------- KFC - number of field to transform -! fields for zonal wavenumber KM -! PSIA - symmetric part of Fourier -! fields for zonal wavenumber KM -! POA1 - spectral -! fields for zonal wavenumber KM - -! Implicit arguments : None. -! -------------------- - -! Method. -! ------- use butterfly or dgemm - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Nils Wedi + Mats Hamrud + George Modzynski - -! Modifications. -! -------------- -! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: -! F. Vana 05-Mar-2015 Support for single precision -! ------------------------------------------------------------------ - -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL -USE TPM_GEOMETRY ,ONLY : G, G_NDGLU -USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 -USE TPM_TRANS, ONLY: REUSE_PTR -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1, NPROC -USE CUDA_GEMM_BATCHED_MOD -USE MPL_MODULE ,ONLY : MPL_BARRIER -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX -USE, INTRINSIC :: ISO_C_BINDING -USE IEEE_ARITHMETIC -USE OPENACC - - -IMPLICIT NONE - -! DUMMY ARGUMENTS -REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) -REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) -REAL(KIND=JPRBT), INTENT(INOUT) :: ZOUT(:) -REAL(KIND=JPRD), INTENT(INOUT) :: ZOUT0(:) -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: POA1(:,:,:) -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - -! LOCAL VARIABLES -INTEGER(KIND=JPIM) :: KM -INTEGER(KIND=JPIM) :: KMLOC -INTEGER(KIND=JPIM) :: IA, IS, ISL, J -INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 - -INTEGER(KIND=JPIM) :: IGLS, JF, JGL -INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 - -INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 -INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 -INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 -INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 -INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS - -IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) - -CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) - -!$ACC DATA & -!$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & -!$ACC& PRESENT(F,F%RW) & -!$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & -!$ACC& PRESENT(ZAA,ZAS,POA1) - -! anti-symmetric -IF(KMLOC0 > 0) THEN - PRINT*,'computing m=0 in double precision' -ENDIF - -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) -ENDIF -CALL GSTATS(414,0) - -IF(KMLOC0 > 0) THEN - ! compute m=0 in double precision: - CALL CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & - & 1.0_JPRD, & - & ZINPA0, IIN0_STRIDES0, 0, & - & ZAA0, SIZE(ZAA0,1), 0, & - & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_LONG) -ENDIF -! Get C in transpose format to get better memory access patterns later -!C=A*B => -! C^T=B^T*A^T -DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - NS(KMLOC) = (R%NSMAX-KM+2)/2 - KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) -ENDDO -IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 -ENDIF -CALL CUDA_GEMM_BATCHED( & - & 21, & ! unique identifier - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, NS(:), KS(:), & - & 1.0_JPRBT, & - & ZINPA, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & - & 0.0_JPRBT, & - & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_LONG) -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(434,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(434,1) -ENDIF -CALL GSTATS(414,1) - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JF=1,2*KF_FS - KM = D_MYMS(KMLOC) - IA = 1+MOD(R_NTMAX-KM+2,2) - IF (KM /= 0) THEN - !$ACC LOOP SEQ - DO J=1,(R%NSMAX-KM+2)/2 - POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ENDDO - ELSEIF (MOD(JF-1,2) == 0) THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX+2)/2 - POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) - ENDDO + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) ENDIF - ENDDO -ENDDO - -! symmetric - -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) -ENDIF -CALL GSTATS(414,0) - -IF(KMLOC0 > 0) THEN - ! compute m=0 in double precision: - call CUDA_GEMM_BATCHED( & - & CUBLAS_OP_N, CUBLAS_OP_N, & - & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & - & 1.0_JPRD, & - & ZINPS0, IIN0_STRIDES0, 0, & - & ZAS0, SIZE(ZAS0,1), 0, & - & 0.0_JPRD, & - & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_LONG) -ENDIF - -! Get C in transpose format to get better memory access patterns later -!C=A*B => -! C^T=B^T*A^T -DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - NS(KMLOC) = (R%NSMAX-KM+3)/2 - KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) -ENDDO -IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 -ENDIF -CALL CUDA_GEMM_BATCHED( & - & 22, & ! unique identifier - & CUBLAS_OP_N, CUBLAS_OP_N, & - & 2*KF_FS, NS(:), KS(:), & - & 1.0_JPRBT, & - & ZINPS, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & - & 0.0_JPRBT, & - & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_LONG) -IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(434,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(434,1) -ENDIF -CALL GSTATS(414,1) - -!$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) ASYNC(1) -DO KMLOC=1,D_NUMP - DO JF=1,2*KF_FS - KM = D_MYMS(KMLOC) - IS = 1+MOD(R_NTMAX-KM+1,2) - IF (KM /= 0) THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX-KM+3)/2 - POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ENDDO - ELSEIF (MOD(JF-1,2) == 0) THEN - !$ACC LOOP SEQ - DO J=1,(R_NSMAX+3)/2 - POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) - ENDDO + CALL GSTATS(414,0) + + IF(KMLOC0 > 0) THEN + ! compute m=0 in double precision: + CALL CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & KF_FS, (R%NSMAX+2)/2, G%NDGLU(0), & + & 1.0_JPRD, & + & ZINPA0, IIN0_STRIDES0, 0, & + & ZAA0, SIZE(ZAA0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_LONG) + ENDIF + ! Get C in transpose format to get better memory access patterns later + !C=A*B => + ! C^T=B^T*A^T + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + NS(KMLOC) = (R%NSMAX-KM+2)/2 + KS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 ENDIF - ENDDO -ENDDO -!$ACC WAIT(1) + CALL CUDA_GEMM_BATCHED( & + & 21, & ! unique identifier + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPA, IIN_STRIDES0, AOFFSETS, & + & ZAA, SIZE(ZAA,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_LONG) + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) + ENDIF + CALL GSTATS(414,1) + + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) DEFAULT(NONE) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IA = 1+MOD(R_NTMAX-KM+2,2) + IF (KM /= 0) THEN + !$ACC LOOP SEQ + DO J=1,(R%NSMAX-KM+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+2)/2 + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO + ENDDO -!$ACC END DATA + ! symmetric + + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(414,0) + + IF(KMLOC0 > 0) THEN + ! compute m=0 in double precision: + call CUDA_GEMM_BATCHED( & + & CUBLAS_OP_N, CUBLAS_OP_N, & + & KF_FS, (R%NSMAX+3)/2, G%NDGLU(0), & + & 1.0_JPRD, & + & ZINPS0, IIN0_STRIDES0, 0, & + & ZAS0, SIZE(ZAS0,1), 0, & + & 0.0_JPRD, & + & ZOUT0, IOUT0_STRIDES0, 0, & + & 1, STREAM=1_C_LONG) + ENDIF + ! Get C in transpose format to get better memory access patterns later + !C=A*B => + ! C^T=B^T*A^T + DO KMLOC=1,D_NUMP + KM = D_MYMS(KMLOC) + NS(KMLOC) = (R%NSMAX-KM+3)/2 + KS(KMLOC) = G%NDGLU(KM) + AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + ENDDO + IF(KMLOC0 > 0) THEN + NS(KMLOC0) = 0 + KS(KMLOC0) = 0 + ENDIF + CALL CUDA_GEMM_BATCHED( & + & 22, & ! unique identifier + & CUBLAS_OP_N, CUBLAS_OP_N, & + & 2*KF_FS, NS(:), KS(:), & + & 1.0_JPRBT, & + & ZINPS, IIN_STRIDES0, AOFFSETS, & + & ZAS, SIZE(ZAS,1), BOFFSETS, & + & 0.0_JPRBT, & + & ZOUT, IOUT_STRIDES0, COFFSETS, & + & D_NUMP, STREAM=1_C_LONG) + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(434,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(434,1) + ENDIF + CALL GSTATS(414,1) + + !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS) DEFAULT(NONE) ASYNC(1) + DO KMLOC=1,D_NUMP + DO JF=1,2*KF_FS + KM = D_MYMS(KMLOC) + IS = 1+MOD(R_NTMAX-KM+1,2) + IF (KM /= 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX-KM+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ENDDO + ELSEIF (MOD(JF-1,2) == 0) THEN + !$ACC LOOP SEQ + DO J=1,(R_NSMAX+3)/2 + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT0((JF-1)/2+1+(J-1)*IOUT0_STRIDES0) + ENDDO + ENDIF + ENDDO + ENDDO + !$ACC WAIT(1) -IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ + !$ACC END DATA -END SUBROUTINE LEDIR + IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE LEDIR END MODULE LEDIR_MOD diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 3672f11af..c6cf5b903 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -53,7 +53,6 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STR END SUBROUTINE SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) - !**** *LEINV* - Inverse Legendre transform. ! Purpose. @@ -312,6 +311,5 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ - END SUBROUTINE LEINV END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index e0d024be6..c941103ce 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -55,231 +55,229 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) - END FUNCTION + SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2, & - & KFLDPTRUV,KFLDPTRSC) - - - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - USE TPM_DIM ,ONLY : R - USE TPM_DISTR ,ONLY : D - USE TPM_GEOMETRY - - USE PREPSNM_MOD ,ONLY : PREPSNM - USE LEDIR_MOD - USE UVTVD_MOD - USE UPDSP_MOD ,ONLY : UPDSP - USE UPDSPB_MOD ,ONLY : UPDSPB - USE MPL_MODULE ,ONLY : MPL_BARRIER - USE TPM_GEN ,ONLY : LSYNC_TRANS - USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + & PSPVOR,PSPDIV,PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2, & + & KFLDPTRUV,KFLDPTRSC) + + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + USE TPM_DIM ,ONLY : R + USE TPM_DISTR ,ONLY : D + USE TPM_GEOMETRY + + USE PREPSNM_MOD ,ONLY : PREPSNM + USE LEDIR_MOD + USE UVTVD_MOD + USE UPDSP_MOD ,ONLY : UPDSP + USE UPDSPB_MOD ,ONLY : UPDSPB + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE ALLOCATOR_MOD - - !**** *LTDIR* - Control of Direct Legendre transform step - - ! Purpose. - ! -------- - ! Tranform from Fourier space to spectral space, compute - ! vorticity and divergence. - - !** Interface. - ! ---------- - ! *CALL* *LTDIR(...)* - - ! Explicit arguments : - ! -------------------- KM - zonal wavenumber - ! KMLOC - local zonal wavenumber - - ! Implicit arguments : None - ! -------------------- - - ! Method. - ! ------- - - ! Externals. - ! ---------- - ! PREPSNM - prepare REPSNM for wavenumber KM - ! PRFI2 - prepares the Fourier work arrays for model variables. - ! LEDIR - direct Legendre transform - ! UVTVD - - ! UPDSP - updating of spectral arrays (fields) - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! Mats Hamrud and Philippe Courtier *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 87-11-24 - ! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite - ! for uv formulation - ! Modified 93-03-19 D. Giard - CDCONF='T' for tendencies - ! Modified 93-11-18 M. Hamrud - use only one Fourier buffer - ! Modified 94-04-06 R. El khatib Full-POS implementation - ! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div - ! instead of u,v->vor,div - ! MPP Group : 95-10-01 Support for Distributed Memory version - ! K. YESSAD (AUGUST 1996): - ! - Legendre transforms for transmission coefficients. - ! Modified : 04/06/99 D.Salmond : change order of AIA and SIA - ! R. El Khatib 12-Jul-2012 LDSPC2 replaced by UVTVD - ! ------------------------------------------------------------------ - - IMPLICIT NONE - - ! DUMMY INTEGER SCALARS - INTEGER(KIND=JPIM) :: KM - INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS - - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) - REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC2(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3A(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) - INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) - - ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPRB), POINTER :: POA1_L(:), POA1(:,:,:) - REAL(KIND=JPRB), POINTER :: POA2_L(:), POA2(:,:,:) - REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) - REAL(KIND=JPRBT), POINTER :: ZOUT(:) - REAL(KIND=JPRD), POINTER :: ZOUT0(:) - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + !**** *LTDIR* - Control of Direct Legendre transform step - - ! ------------------------------------------------------------------ - IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) - - ! ------------------------------------------------------------------ - - !* 1. PREPARE LEGENDRE POLONOMIALS AND EPSNM - ! -------------------------------------- - - - ! ------------------------------------------------------------------ - - !* 2. PREPARE WORK ARRAYS. - ! -------------------- - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& - IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) - - IALLOC_POS = 1 - - IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) - CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) - CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) - CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) - CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZOUT - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUT(1)),128) - CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZOUT0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUT0(1)),128) - CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! do the legendre transform - CALL LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) - - !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) - !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) - !$ACC DATA COPYOUT(PSPSC2) IF(NF_SC2 > 0) - !$ACC DATA COPYOUT(PSPSC3A) IF(NF_SC3A > 0) - !$ACC DATA COPYOUT(PSPSC3B) IF(NF_SC3B > 0) - - ! ------------------------------------------------------------------ - - !* 5. COMPUTE VORTICITY AND DIVERGENCE. - ! --------------------------------- - - IF( KF_UV > 0 ) THEN - ! U and V are in POA1 - IFIRST = 0 - PU => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV - PV => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) - ! Compute VOR and DIV ino POA2 - IFIRST = 0 - PVOR => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV - PDIV => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) - - ! Compute vorticity and divergence - CALL UVTVD(KF_UV,PU,PV,PVOR,PDIV) - - ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV - CALL UPDSPB(KF_UV,PVOR,PSPVOR,KFLDPTRUV) - CALL UPDSPB(KF_UV,PDIV,PSPDIV,KFLDPTRUV) - - ENDIF - ! ------------------------------------------------------------------ - - !* 6. UPDATE SPECTRAL ARRAYS. - ! ----------------------- - - ! this is on the host, so need to cp from device, Nils - CALL UPDSP(KF_UV,KF_SCALARS,POA1,& - & PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - - IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) - ENDIF - CALL GSTATS(412,0) - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - IF (LSYNC_TRANS) THEN - CALL GSTATS(432,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(432,1) - ENDIF - CALL GSTATS(412,1) - - ! ------------------------------------------------------------------ - - IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) + ! Purpose. + ! -------- + ! Tranform from Fourier space to spectral space, compute + ! vorticity and divergence. + + !** Interface. + ! ---------- + ! *CALL* *LTDIR(...)* + + ! Explicit arguments : + ! -------------------- KM - zonal wavenumber + ! KMLOC - local zonal wavenumber + + ! Implicit arguments : None + ! -------------------- + + ! Method. + ! ------- + + ! Externals. + ! ---------- + ! PREPSNM - prepare REPSNM for wavenumber KM + ! PRFI2 - prepares the Fourier work arrays for model variables. + ! LEDIR - direct Legendre transform + ! UVTVD - + ! UPDSP - updating of spectral arrays (fields) + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! Mats Hamrud and Philippe Courtier *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 87-11-24 + ! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite + ! for uv formulation + ! Modified 93-03-19 D. Giard - CDCONF='T' for tendencies + ! Modified 93-11-18 M. Hamrud - use only one Fourier buffer + ! Modified 94-04-06 R. El khatib Full-POS implementation + ! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div + ! instead of u,v->vor,div + ! MPP Group : 95-10-01 Support for Distributed Memory version + ! K. YESSAD (AUGUST 1996): + ! - Legendre transforms for transmission coefficients. + ! Modified : 04/06/99 D.Salmond : change order of AIA and SIA + ! R. El Khatib 12-Jul-2012 LDSPC2 replaced by UVTVD + ! ------------------------------------------------------------------ + + IMPLICIT NONE + + ! DUMMY INTEGER SCALARS + INTEGER(KIND=JPIM) :: KM + INTEGER(KIND=JPIM) :: KMLOC + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS + + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC2(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(OUT) :: PSPSC3B(:,:,:) + INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) + INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) + REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:) + REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) + + ! LOCAL INTEGER SCALARS + INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU, IFIRST + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPRB), POINTER :: POA1_L(:), POA1(:,:,:) + REAL(KIND=JPRB), POINTER :: POA2_L(:), POA2(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRBT), POINTER :: ZOUT(:) + REAL(KIND=JPRD), POINTER :: ZOUT0(:) + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + + + + ! ------------------------------------------------------------------ + IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',0,ZHOOK_HANDLE) + + ! ------------------------------------------------------------------ + + !* 1. PREPARE LEGENDRE POLONOMIALS AND EPSNM + ! -------------------------------------- + + + ! ------------------------------------------------------------------ + + !* 2. PREPARE WORK ARRAYS. + ! -------------------- + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + + IALLOC_POS = 1 + + IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) + CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) + CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUT(1)),128) + CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUT0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUT0(1)),128) + CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! do the legendre transform + CALL LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) + + !$ACC DATA COPYOUT(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYOUT(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYOUT(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYOUT(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYOUT(PSPSC3B) IF(NF_SC3B > 0) + + ! ------------------------------------------------------------------ + + !* 5. COMPUTE VORTICITY AND DIVERGENCE. + ! --------------------------------- + + IF( KF_UV > 0 ) THEN + ! U and V are in POA1 + IFIRST = 0 + PU => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PV => POA1(IFIRST+1:IFIRST+2*KF_UV,:,:) + ! Compute VOR and DIV ino POA2 + IFIRST = 0 + PVOR => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV + PDIV => POA2(IFIRST+1:IFIRST+2*KF_UV,:,:) + + ! Compute vorticity and divergence + CALL UVTVD(KF_UV,PU,PV,PVOR,PDIV) + + ! Write back. Note, if we have UV, the contract says we *must* have VOR/DIV + CALL UPDSPB(KF_UV,PVOR,PSPVOR,KFLDPTRUV) + CALL UPDSPB(KF_UV,PDIV,PSPDIV,KFLDPTRUV) + + ENDIF + ! ------------------------------------------------------------------ + + !* 6. UPDATE SPECTRAL ARRAYS. + ! ----------------------- + + ! this is on the host, so need to cp from device, Nils + CALL UPDSP(KF_UV,KF_SCALARS,POA1,& + & PSPSCALAR,& + & PSPSC3A,PSPSC3B,PSPSC2 , & + & KFLDPTRUV,KFLDPTRSC) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(412,0) + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + IF (LSYNC_TRANS) THEN + CALL GSTATS(432,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) + ENDIF + CALL GSTATS(412,1) + + ! ------------------------------------------------------------------ + IF (LHOOK) CALL DR_HOOK('LTDIR_MOD',1,ZHOOK_HANDLE) END SUBROUTINE LTDIR - END MODULE LTDIR_MOD +END MODULE LTDIR_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index f5baee608..bca1d4cdd 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -22,7 +22,7 @@ MODULE LTINV_MOD TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUTS_AND_OUTA END TYPE - CONTAINS +CONTAINS FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HLTINV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD USE TPM_DISTR, ONLY: D @@ -93,304 +93,300 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) - END FUNCTION SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & - & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - - USE TPM_DIM ,ONLY : R - USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, LSCDERS - USE TPM_FLT - USE TPM_GEOMETRY - USE TPM_DISTR ,ONLY : D - USE PRFI1B_MOD ,ONLY : PRFI1B - USE VDTUV_MOD ,ONLY : VDTUV - USE SPNSDE_MOD ,ONLY : SPNSDE - USE LEINV_MOD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - use ieee_arithmetic - USE TPM_FIELDS ,ONLY : F,ZEPSNM - USE MPL_MODULE ,ONLY : MPL_BARRIER - USE TPM_GEN ,ONLY : LSYNC_TRANS - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - - !**** *LTINV* - Inverse Legendre transform - ! - ! Purpose. - ! -------- - ! Tranform from Laplace space to Fourier space, compute U and V - ! and north/south derivatives of state variables. - - !** Interface. - ! ---------- - ! *CALL* *LTINV(...) - - ! Explicit arguments : - ! -------------------- - ! KM - zonal wavenumber - ! KMLOC - local zonal wavenumber - ! PSPVOR - spectral vorticity - ! PSPDIV - spectral divergence - ! PSPSCALAR - spectral scalar variables - - ! Implicit arguments : The Laplace arrays of the model. - ! -------------------- The values of the Legendre polynomials - ! The grid point arrays of the model - ! Method. - ! ------- - - ! Externals. - ! ---------- - - ! PREPSNM - prepare REPSNM for wavenumber KM - ! PRFI1B - prepares the spectral fields - ! VDTUV - compute u and v from vorticity and divergence - ! SPNSDE - compute north-south derivatives - ! LEINV - Inverse Legendre transform - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - ! Temperton, 1991, MWR 119 p1303 - - ! Author. - ! ------- - ! Mats Hamrud *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 00-02-01 From LTINV in IFS CY22R1 - ! ------------------------------------------------------------------ - - IMPLICIT NONE - - - INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV - INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS - - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPVOR(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPDIV(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSCALAR(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) - REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) - REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: ZOUTS(:), ZOUTA(:) - REAL(KIND=JPRD), POINTER, INTENT(OUT) :: ZOUTS0(:), ZOUTA0(:) - - INTEGER(KIND=JPIM) :: IFIRST, J3 - - REAL(KIND=JPRB), POINTER :: PIA_L(:), PIA(:,:,:) - REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) - REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV - - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 - - INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - - REAL(KIND=JPRBT), POINTER :: ZINP(:) - REAL(KIND=JPRD), POINTER :: ZINP0(:) - - ! ------------------------------------------------------------------ - - !* 1. PERFORM LEGENDRE TRANFORM. - ! -------------------------- - - IF (LHOOK) CALL DR_HOOK('LTINV_MOD',0,ZHOOK_HANDLE) - - ! Get all pointers - IF_READIN = 0 - IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence - IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence - IF_READIN = IF_READIN + KF_UV ! U - IF_READIN = IF_READIN + KF_UV ! V - IF_READIN = IF_READIN + KF_SCALARS ! Scalars - IF (LSCDERS) & - IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives - - !IPIA_SZ = ALIGN(2*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - - ! In Legendre space, we then ignore vorticity/divergence, if - ! they don't need to be transformed. - IF_LEG = IF_READIN - IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed - IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed - - CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) - - IALLOC_POS = 1 - - ! PIA - IALLOC_SZ = ALIGN(2*IF_READIN*(R%NTMAX+3)*D%NUMP*SIZEOF(PIA_L(1)),128) - CALL ASSIGN_PTR(PIA_L, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& - & IALLOC_POS, IALLOC_SZ) - CALL C_F_POINTER(C_LOC(PIA_L), PIA, (/ 2*IF_READIN, R%NTMAX+3, D%NUMP /)) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZINP - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINP(1)),128) - CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZINP0 - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINP0(1)),128) - CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - IALLOC_POS = 1 - - ! ZOUTA - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTA(1)),128) - CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZOUTS - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTS(1)),128) - CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZOUTA0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTA0(1)),128) - CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! ZOUTS0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTS0(1)),128) - CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS = IALLOC_POS + IALLOC_SZ - - ! Assign pointers do the different components of PIA - IFIRST = 0 - IF (.NOT. LVORGP .OR. LDIVGP) THEN - ! Usually we want to store vorticity first - PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! Vorticity - - PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! Divergence - ELSE - ! Except if we want to translate Vorticity but not Divergence, we should have Divergence first - ! Then we have all buffers that move on in a contiguous buffer - PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! Divergence - - PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! Vorticity - ENDIF - PU => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! U - PV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) - IFIRST = IFIRST + 2*KF_UV ! V - PSCALARS => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) - IFIRST = IFIRST + 2*KF_SCALARS ! Scalars - IF (LSCDERS) THEN - PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) - IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives - ENDIF - - ! ------------------------------------------------------------------ - - - !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. - ! ---------------------------------------------- - - IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(422,0) - !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) - !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) - !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) - !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) - !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) - IF (LSYNC_TRANS) THEN - CALL GSTATS(442,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(442,1) - ENDIF - CALL GSTATS(422,1) - - IF (KF_UV > 0) THEN - CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2)) - CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2)) - - ! Compute U and V for VOR and DIV - CALL VDTUV(KF_UV,ZEPSNM,PVOR,PDIV,PU,PV) - ENDIF - - IF (KF_SCALARS > 0) THEN - IF(PRESENT(PSPSCALAR)) THEN - CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2)) + & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & + & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + + USE TPM_DIM ,ONLY : R + USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, LSCDERS + USE TPM_FLT + USE TPM_GEOMETRY + USE TPM_DISTR ,ONLY : D + USE PRFI1B_MOD ,ONLY : PRFI1B + USE VDTUV_MOD ,ONLY : VDTUV + USE SPNSDE_MOD ,ONLY : SPNSDE + USE LEINV_MOD + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + use ieee_arithmetic + USE TPM_FIELDS ,ONLY : F,ZEPSNM + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + + !**** *LTINV* - Inverse Legendre transform + ! + ! Purpose. + ! -------- + ! Tranform from Laplace space to Fourier space, compute U and V + ! and north/south derivatives of state variables. + + !** Interface. + ! ---------- + ! *CALL* *LTINV(...) + + ! Explicit arguments : + ! -------------------- + ! KM - zonal wavenumber + ! KMLOC - local zonal wavenumber + ! PSPVOR - spectral vorticity + ! PSPDIV - spectral divergence + ! PSPSCALAR - spectral scalar variables + + ! Implicit arguments : The Laplace arrays of the model. + ! -------------------- The values of the Legendre polynomials + ! The grid point arrays of the model + ! Method. + ! ------- + + ! Externals. + ! ---------- + + ! PREPSNM - prepare REPSNM for wavenumber KM + ! PRFI1B - prepares the spectral fields + ! VDTUV - compute u and v from vorticity and divergence + ! SPNSDE - compute north-south derivatives + ! LEINV - Inverse Legendre transform + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + ! Temperton, 1991, MWR 119 p1303 + + ! Author. + ! ------- + ! Mats Hamrud *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 00-02-01 From LTINV in IFS CY22R1 + ! ------------------------------------------------------------------ + + IMPLICIT NONE + + + INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV + INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS + + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPVOR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPDIV(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSCALAR(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC2(:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3A(:,:,:) + REAL(KIND=JPRB) ,OPTIONAL,INTENT(IN) :: PSPSC3B(:,:,:) + REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: ZOUTS(:), ZOUTA(:) + REAL(KIND=JPRD), POINTER, INTENT(OUT) :: ZOUTS0(:), ZOUTA0(:) + + INTEGER(KIND=JPIM) :: IFIRST, J3 + + REAL(KIND=JPRB), POINTER :: PIA_L(:), PIA(:,:,:) + REAL(KIND=JPRB), POINTER :: PU(:,:,:), PV(:,:,:), PVOR(:,:,:), PDIV(:,:,:) + REAL(KIND=JPRB), POINTER :: PSCALARS(:,:,:), PSCALARS_NSDER(:,:,:) + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV + + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + + INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG + INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + + REAL(KIND=JPRBT), POINTER :: ZINP(:) + REAL(KIND=JPRD), POINTER :: ZINP0(:) + + ! ------------------------------------------------------------------ + + !* 1. PERFORM LEGENDRE TRANFORM. + ! -------------------------- + + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',0,ZHOOK_HANDLE) + + ! Get all pointers + IF_READIN = 0 + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence + IF_READIN = IF_READIN + KF_UV ! U + IF_READIN = IF_READIN + KF_UV ! V + IF_READIN = IF_READIN + KF_SCALARS ! Scalars + IF (LSCDERS) & + IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives + + ! In Legendre space, we then ignore vorticity/divergence, if + ! they don't need to be transformed. + IF_LEG = IF_READIN + IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed + IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed + + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& + IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + + IALLOC_POS = 1 + + ! PIA + IALLOC_SZ = ALIGN(2*IF_READIN*(R%NTMAX+3)*D%NUMP*SIZEOF(PIA_L(1)),128) + CALL ASSIGN_PTR(PIA_L, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + CALL C_F_POINTER(C_LOC(PIA_L), PIA, (/ 2*IF_READIN, R%NTMAX+3, D%NUMP /)) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP + IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINP(1)),128) + CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZINP0 + IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINP0(1)),128) + CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + IALLOC_POS = 1 + + ! ZOUTA + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTA(1)),128) + CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS + IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTS(1)),128) + CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTA0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTA0(1)),128) + CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! ZOUTS0 + IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTS0(1)),128) + CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& + & IALLOC_POS, IALLOC_SZ) + IALLOC_POS = IALLOC_POS + IALLOC_SZ + + ! Assign pointers do the different components of PIA + IFIRST = 0 + IF (.NOT. LVORGP .OR. LDIVGP) THEN + ! Usually we want to store vorticity first + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence ELSE - IFIRST = 1 - IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN - CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC2-1,:,:),PSPSC2(:,:),NF_SC2,UBOUND(PSPSC2,2)) - IFIRST = IFIRST+2*NF_SC2 - ENDIF - IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN - DO J3=1,UBOUND(PSPSC3A,3) - CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3A-1,:,:),PSPSC3A(:,:,J3),NF_SC3A,UBOUND(PSPSC3A,2)) + ! Except if we want to translate Vorticity but not Divergence, we should have Divergence first + ! Then we have all buffers that move on in a contiguous buffer + PDIV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Divergence + + PVOR => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! Vorticity + ENDIF + PU => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! U + PV => PIA(IFIRST+1:IFIRST+2*KF_UV,:,:) + IFIRST = IFIRST + 2*KF_UV ! V + PSCALARS => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars + IF (LSCDERS) THEN + PSCALARS_NSDER => PIA(IFIRST+1:IFIRST+2*KF_SCALARS,:,:) + IFIRST = IFIRST + 2*KF_SCALARS ! Scalars NS Derivatives + ENDIF + + ! ------------------------------------------------------------------ + + + !* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. + ! ---------------------------------------------- + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) + !$ACC DATA COPYIN(PSPVOR,PSPDIV) IF(KF_UV > 0) + !$ACC DATA COPYIN(PSPSCALAR) IF(PRESENT(PSPSCALAR) .AND. KF_SCALARS > 0) + !$ACC DATA COPYIN(PSPSC2) IF(NF_SC2 > 0) + !$ACC DATA COPYIN(PSPSC3A) IF(NF_SC3A > 0) + !$ACC DATA COPYIN(PSPSC3B) IF(NF_SC3B > 0) + IF (LSYNC_TRANS) THEN + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) + + IF (KF_UV > 0) THEN + CALL PRFI1B(PVOR,PSPVOR,KF_UV,UBOUND(PSPVOR,2)) + CALL PRFI1B(PDIV,PSPDIV,KF_UV,UBOUND(PSPDIV,2)) + + ! Compute U and V for VOR and DIV + CALL VDTUV(KF_UV,ZEPSNM,PVOR,PDIV,PU,PV) + ENDIF + + IF (KF_SCALARS > 0) THEN + IF(PRESENT(PSPSCALAR)) THEN + CALL PRFI1B(PSCALARS,PSPSCALAR,KF_SCALARS,UBOUND(PSPSCALAR,2)) + ELSE + IFIRST = 1 + IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC2-1,:,:),PSPSC2(:,:),NF_SC2,UBOUND(PSPSC2,2)) + IFIRST = IFIRST+2*NF_SC2 + ENDIF + IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN + DO J3=1,UBOUND(PSPSC3A,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3A-1,:,:),PSPSC3A(:,:,J3),NF_SC3A,UBOUND(PSPSC3A,2)) IFIRST = IFIRST+2*NF_SC3A - ENDDO - ENDIF - IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN - DO J3=1,UBOUND(PSPSC3B,3) - CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3B-1,:,:),PSPSC3B(:,:,J3),NF_SC3B,UBOUND(PSPSC3B,2)) + ENDDO + ENDIF + IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN + DO J3=1,UBOUND(PSPSC3B,3) + CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3B-1,:,:),PSPSC3B(:,:,J3),NF_SC3B,UBOUND(PSPSC3B,2)) IFIRST = IFIRST+2*NF_SC3B - ENDDO - ENDIF - IF(IFIRST-1 /= 2*KF_SCALARS) THEN - WRITE(0,*) 'LTINV:KF_SCALARS,IFIRST',KF_SCALARS,IFIRST - CALL ABORT_TRANS('LTINV_MOD:IFIRST /= 2*KF_SCALARS') + ENDDO + ENDIF + IF(IFIRST-1 /= 2*KF_SCALARS) THEN + WRITE(0,*) 'LTINV:KF_SCALARS,IFIRST',KF_SCALARS,IFIRST + CALL ABORT_TRANS('LTINV_MOD:IFIRST /= 2*KF_SCALARS') + ENDIF ENDIF ENDIF - ENDIF - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - !$ACC END DATA - - ! Compute NS derivatives if needed - IF (LSCDERS) THEN - CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) - ENDIF + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + !$ACC END DATA + + ! Compute NS derivatives if needed + IF (LSCDERS) THEN + CALL SPNSDE(KF_SCALARS,ZEPSNM,PSCALARS,PSCALARS_NSDER) + ENDIF - ! ------------------------------------------------------------------ + ! ------------------------------------------------------------------ - !* 4. INVERSE LEGENDRE TRANSFORM. - ! --------------------------- + !* 4. INVERSE LEGENDRE TRANSFORM. + ! --------------------------- - ! Legendre transforms. When converting PIA into ZOUT, we ignore the first entries of LEINV. - ! This is because vorticity and divergence is not necessarily converted to GP space. - CALL LEINV(PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) + ! Legendre transforms. When converting PIA into ZOUT, we ignore the first entries of LEINV. + ! This is because vorticity and divergence is not necessarily converted to GP space. + CALL LEINV(PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) - IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ - + IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ END SUBROUTINE LTINV - END MODULE LTINV_MOD - +END MODULE LTINV_MOD + diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 839727d34..51dd4f3b3 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -19,7 +19,7 @@ MODULE TRGTOL_MOD TYPE TRGTOL_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL END TYPE - CONTAINS +CONTAINS FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT USE ALLOCATOR_MOD @@ -40,549 +40,546 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) - END FUNCTION + SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP,KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2) - - !**** *TRGTOL * - transposition of grid point data from column - ! structure to latitudinal. Reorganize data between - ! grid point calculations and direct Fourier Transform - - ! Version using CUDA-aware MPI - - ! Purpose. - ! -------- - - - !** Interface. - ! ---------- - ! *call* *trgtol(...) - - ! Explicit arguments : - ! -------------------- - ! PREEL_REAL - Latitudinal data ready for direct FFT (output) - ! PGP - Blocked grid point data (input) - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original: 95-10-01 - ! D.Dent : 97-08-04 Reorganisation to allow - ! NPRTRV to differ from NPRGPEW - ! : 98-06-17 add mailbox control logic (from TRLTOM) - ! =99-03-29= Mats Hamrud and Deborah Salmond - ! JUMP in FFT's changed to 1 - ! KINDEX introduced and ZCOMBUF not used for same PE - ! 01-11-23 Deborah Salmond and John Hague - ! LIMP_NOOLAP Option for non-overlapping message passing - ! and buffer packing - ! 01-12-18 Peter Towers - ! Improved vector performance of GTOL_PACK,GTOL_UNPACK - ! 03-04-02 G. Radnoti: call barrier always when nproc>1 - ! 08-01-01 G.Mozdzynski: cleanup - ! 09-01-02 G.Mozdzynski: use non-blocking recv and send - ! ------------------------------------------------------------------ - - - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER - USE TPM_GEN ,ONLY : LSYNC_TRANS - USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS - USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV - USE PE2SET_MOD ,ONLY : PE2SET - USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML - USE OML_MOD ,ONLY : OML_MY_THREAD - USE MPI - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE TPM_TRANS ,ONLY : NPROMA - USE ALLOCATOR_MOD + &PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP,KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2) + + !**** *TRGTOL * - transposition of grid point data from column + ! structure to latitudinal. Reorganize data between + ! grid point calculations and direct Fourier Transform + + ! Version using CUDA-aware MPI + + ! Purpose. + ! -------- + + + !** Interface. + ! ---------- + ! *call* *trgtol(...) + + ! Explicit arguments : + ! -------------------- + ! PREEL_REAL - Latitudinal data ready for direct FFT (output) + ! PGP - Blocked grid point data (input) + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original: 95-10-01 + ! D.Dent : 97-08-04 Reorganisation to allow + ! NPRTRV to differ from NPRGPEW + ! : 98-06-17 add mailbox control logic (from TRLTOM) + ! =99-03-29= Mats Hamrud and Deborah Salmond + ! JUMP in FFT's changed to 1 + ! KINDEX introduced and ZCOMBUF not used for same PE + ! 01-11-23 Deborah Salmond and John Hague + ! LIMP_NOOLAP Option for non-overlapping message passing + ! and buffer packing + ! 01-12-18 Peter Towers + ! Improved vector performance of GTOL_PACK,GTOL_UNPACK + ! 03-04-02 G. Radnoti: call barrier always when nproc>1 + ! 08-01-01 G.Mozdzynski: cleanup + ! 09-01-02 G.Mozdzynski: use non-blocking recv and send + ! ------------------------------------------------------------------ + + + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE PE2SET_MOD ,ONLY : PE2SET + USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML + USE OML_MOD ,ONLY : OML_MY_THREAD + USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE TPM_TRANS ,ONLY : NPROMA + USE ALLOCATOR_MOD - IMPLICIT NONE + IMPLICIT NONE - REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:), KVSETSC(:), KVSETSC3A(:), KVSETSC3B(:), KVSETSC2(:) - REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:), PGPUV(:,:,:,:), PGP3A(:,:,:,:), PGP3B(:,:,:,:), PGP2(:,:,:) - - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRGTOL_HANDLE), INTENT(IN) :: HTRGTOL - - ! LOCAL VARIABLES - - ! LOCAL INTEGER SCALARS - REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*2) - INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) - INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) - - INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& - &ILASTLAT, ILEN, JROC, IPOS, ISETA, & - &ISETB, IRECV, & - &ISETV, ISEND, JBLK, JFLD, & - &JGL, JI, JK, JL, ISETW, IFLD, & - &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & - &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT - INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 - - INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP - INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V - INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V - INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V - INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V - INTEGER(KIND=JPIM) :: IFLDA(KF_GP) - INTEGER(KIND=JPIM) :: IVSET(KF_GP) - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - - INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 - INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 - INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3 - INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3B = 4 - INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 - INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) - - #ifdef PARKINDTRANS_SINGLE - #define TRGTOL_DTYPE MPI_REAL - #else - #define TRGTOL_DTYPE MPI_DOUBLE_PRECISION - #endif - - ! ------------------------------------------------------------------ - - !* 0. Some initializations - ! -------------------- - ! Note we have either - ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or - ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) - ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which - ! should match PSPXXX and PGPXXX arrays) - IOFF=0 - IF(PRESENT(KVSETUV)) THEN - IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) - IOFF=IOFF+KF_UV_G - ELSE - IVSET(IOFF+1:IOFF+KF_UV_G) = -1 - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = -1 - IOFF=IOFF+KF_UV_G - ENDIF - IF(PRESENT(KVSETSC)) THEN - IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) - IOFF=IOFF+KF_SCALARS_G - ELSE - IF(PRESENT(KVSETSC2)) THEN - IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) - IOFF=IOFF+SIZE(KVSETSC2) + REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:), KVSETSC(:), KVSETSC3A(:), KVSETSC3B(:), KVSETSC2(:) + REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:), PGPUV(:,:,:,:), PGP3A(:,:,:,:), PGP3B(:,:,:,:), PGP2(:,:,:) + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRGTOL_HANDLE), INTENT(IN) :: HTRGTOL + + ! LOCAL VARIABLES + + ! LOCAL INTEGER SCALARS + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) + + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) + + INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& + &ILASTLAT, ILEN, JROC, IPOS, ISETA, & + &ISETB, IRECV, & + &ISETV, ISEND, JBLK, JFLD, & + &JGL, JI, JK, JL, ISETW, IFLD, & + &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & + &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT + INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 + + INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP + INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V + INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3B = 4 + INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 + INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) + + #ifdef PARKINDTRANS_SINGLE + #define TRGTOL_DTYPE MPI_REAL + #else + #define TRGTOL_DTYPE MPI_DOUBLE_PRECISION + #endif + + ! ------------------------------------------------------------------ + + !* 0. Some initializations + ! -------------------- + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) + IOFF=0 + IF(PRESENT(KVSETUV)) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:) + IOFF=IOFF+KF_UV_G + ELSE + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = -1 + IOFF=IOFF+KF_UV_G ENDIF - IF(PRESENT(KVSETSC3A)) THEN - DO J3=1,SIZE(PGP3A,3) - IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) - IOFF=IOFF+SIZE(KVSETSC3A) - ENDDO + IF(PRESENT(KVSETSC)) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ELSE + IF(PRESENT(KVSETSC2)) THEN + IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:) + IOFF=IOFF+SIZE(KVSETSC2) + ENDIF + IF(PRESENT(KVSETSC3A)) THEN + DO J3=1,SIZE(PGP3A,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF(PRESENT(KVSETSC3B)) THEN + DO J3=1,SIZE(PGP3B,3) + IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + ENDIF + + IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN + PRINT*, "TRGTOL: ERROR IN IVSET COMPUTATION" + FLUSH(6) + STOP 38 ENDIF - IF(PRESENT(KVSETSC3B)) THEN - DO J3=1,SIZE(PGP3B,3) - IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) - IOFF=IOFF+SIZE(KVSETSC3B) + + IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE) + + CALL GSTATS(1805,0) + IOFF=1 + PGP_INDICES(PGP_INDICES_UV) = IOFF + IF (PRESENT(PGPUV)) IOFF=IOFF+UBOUND(PGPUV,2)*2 + PGP_INDICES(PGP_INDICES_GP2) = IOFF + IF (PRESENT(PGP2)) IOFF=IOFF+UBOUND(PGP2,2) + PGP_INDICES(PGP_INDICES_GP3A) = IOFF + IF (PRESENT(PGP3A)) IOFF=IOFF+UBOUND(PGP3A,2)*UBOUND(PGP3A,3) + PGP_INDICES(PGP_INDICES_GP3B) = IOFF + IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3) + PGP_INDICES(PGP_INDICES_END) = IOFF + + ! Prepare sender arrays + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because IVSET(JFLD) == -1 if there is only one V-set + ISEND_FIELD_COUNT(1) = KF_GP + ELSE + ISEND_FIELD_COUNT(:) = 0 + DO JFLD=1,KF_GP + ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1 ENDDO ENDIF - ENDIF - - IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN - PRINT*, "TRGTOL: ERROR IN IVSET COMPUTATION" - FLUSH(6) - STOP 38 - ENDIF - - IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE) - - CALL GSTATS(1805,0) - IOFF=1 - PGP_INDICES(PGP_INDICES_UV) = IOFF - IF (PRESENT(PGPUV)) IOFF=IOFF+UBOUND(PGPUV,2)*2 - PGP_INDICES(PGP_INDICES_GP2) = IOFF - IF (PRESENT(PGP2)) IOFF=IOFF+UBOUND(PGP2,2) - PGP_INDICES(PGP_INDICES_GP3A) = IOFF - IF (PRESENT(PGP3A)) IOFF=IOFF+UBOUND(PGP3A,2)*UBOUND(PGP3A,3) - PGP_INDICES(PGP_INDICES_GP3B) = IOFF - IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3) - PGP_INDICES(PGP_INDICES_END) = IOFF - - ! Prepare sender arrays - ! find number of fields on a certain V-set - IF(NPRTRV == 1) THEN - ! This is needed because IVSET(JFLD) == -1 if there is only one V-set - ISEND_FIELD_COUNT(1) = KF_GP - ELSE - ISEND_FIELD_COUNT(:) = 0 - DO JFLD=1,KF_GP - ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1 + ! find number of grid-points on a certain W-set that overlap with myself + ISEND_WSET_SIZE(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) + ENDDO + ! sum up offsets + ISEND_WSET_OFFSET(1) = 0 + DO JROC=1,NPRTRW + ISEND_WSET_OFFSET(JROC+1)=ISEND_WSET_OFFSET(JROC)+ISEND_WSET_SIZE(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total send size is # points per field * # fields + ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) ENDDO - ENDIF - ! find number of grid-points on a certain W-set that overlap with myself - ISEND_WSET_SIZE(:) = 0 - DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) - ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 - ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & - & ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) - ENDDO - ! sum up offsets - ISEND_WSET_OFFSET(1) = 0 - DO JROC=1,NPRTRW - ISEND_WSET_OFFSET(JROC+1)=ISEND_WSET_OFFSET(JROC)+ISEND_WSET_SIZE(JROC) - ENDDO - DO JROC=1,NPROC - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - ! total send size is # points per field * # fields - ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) - ENDDO - - ! Prepare receiver arrays - IRECV_BUFR_TO_OUT_OFFSET(:) = 0 - DO JROC=1,NPROC - ! Get new offset to my current KINDEX entry - IF (JROC > 1 .AND. KF_FS > 0) THEN - IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS - ELSEIF (JROC > 1) THEN - IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1) - ENDIF - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - - ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) - ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver - ! (me, the W-set). Ideally those conincide, at least mostly. - IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) - ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) - ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) - - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - ! get from "actual" latitude to the latitude strip offset - IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) - ! get from "actual" latitude to the latitude offset - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=1,D%NONL(IGL,ISETB) - IPOS = IPOS+1 - ! offset to first layer of this gridpoint - IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,1) = & - & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) - ! distance between two layers of this gridpoint - IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,2) = & - & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + ! Prepare receiver arrays + IRECV_BUFR_TO_OUT_OFFSET(:) = 0 + DO JROC=1,NPROC + ! Get new offset to my current KINDEX entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1) + ENDIF + + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. + IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) + ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) + + IPOS = 0 + DO JGL=IFIRSTLAT,ILASTLAT + ! get from "actual" latitude to the latitude strip offset + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! offset to first layer of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + ENDDO ENDDO + !we always receive the full fourier space + IRECVTOT(JROC) = IPOS*KF_FS ENDDO - !we always receive the full fourier space - IRECVTOT(JROC) = IPOS*KF_FS - ENDDO - CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1, KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))) + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& + & KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1, KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))) - !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) + !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) - CALL GSTATS(1805,1) + CALL GSTATS(1805,1) - ! Put data on device for copyin - IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) - ENDIF - CALL GSTATS(412,0) - !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) ASYNC(1) - IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(432,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(432,1) - ENDIF - CALL GSTATS(412,1) - - ! Figure out processes that send or recv something - ISEND_COUNTS = 0 - IRECV_COUNTS = 0 - DO JROC=1,NPROC - IF( JROC /= MYPROC) THEN - IF(IRECVTOT(JROC) > 0) THEN - ! I have to recv something, so let me store that - IRECV_COUNTS = IRECV_COUNTS + 1 - IRECV_TO_PROC(IRECV_COUNTS)=JROC - ENDIF - IF(ISENDTOT(JROC) > 0) THEN - ! I have to send something, so let me store that - ISEND_COUNTS = ISEND_COUNTS+1 - ISEND_TO_PROC(ISEND_COUNTS)=JROC - ENDIF + ! Put data on device for copyin + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(412,0) + !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) ASYNC(1) + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(432,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(432,1) ENDIF - ENDDO - - ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) - ICOMBUFS_OFFSET(1) = 0 - DO JROC=1,ISEND_COUNTS - ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) - ENDDO - ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) - ICOMBUFR_OFFSET(1) = 0 - DO JROC=1,IRECV_COUNTS - ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) - ENDDO - - IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) - ENDIF - - !....Pack loop......................................................... - !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) - - CALL GSTATS(1602,0) - DO INS=1,ISEND_COUNTS - ISEND=ISEND_TO_PROC(INS) - CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) - - ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV) - ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) - - IFLDS = 0 - DO JFLD=1,KF_GP - IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS)=KPTRGP(JFLD) - ELSE - IFLDA(IFLDS)=JFLD + CALL GSTATS(412,1) + + ! Figure out processes that send or recv something + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC ENDIF ENDIF ENDDO - !$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1) - - ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW) - ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW) - IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JFLD=1,ISEND_FIELD_COUNT_V - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK) - ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF) ASYNC(1) - DO JFLD=1,ISEND_FIELD_COUNT_V - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - ! TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(JI) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ENDIF - ENDDO - ENDDO + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) + ENDDO + + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& + & 1_C_SIZE_T, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) ENDIF - !$ACC END DATA - ENDDO - !$ACC WAIT(1) - CALL GSTATS(1602,1) - - IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) - ENDIF - - CALL GSTATS(411,0) - IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) - ENDIF - !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) - - IR=0 - - !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) - ! Receive loop......................................................... - DO INR=1,IRECV_COUNTS - IR=IR+1 - IPROC=IRECV_TO_PROC(INR) - CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & - & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) - ENDDO - - !....Send loop......................................................... - DO INS=1,ISEND_COUNTS - IR=IR+1 - ISEND=ISEND_TO_PROC(INS) - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & - & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) - ENDDO - !$ACC END HOST_DATA - - ! Copy local contribution - IF(ISENDTOT(MYPROC) > 0 )THEN - ! I have to send something to myself... - - ! Input is KF_GP fields. We find the resulting KF_FS fields. - IFLDS = 0 - DO JFLD=1,KF_GP - IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS) = KPTRGP(JFLD) - ELSE - IFLDA(IFLDS) = JFLD + + !....Pack loop......................................................... + !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1) + + CALL GSTATS(1602,0) + DO INS=1,ISEND_COUNTS + ISEND=ISEND_TO_PROC(INS) + CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV) + + ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) + + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD + ENDIF ENDIF + ENDDO + + !$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1) + + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW) + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK) + ENDDO + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF) ASYNC(1) + DO JFLD=1,ISEND_FIELD_COUNT_V + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + ! TODO we could certainly reshape PGPXX arrays and we would simplify this + ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + ZCOMBUFS(JI) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO + ENDDO ENDIF + !$ACC END DATA ENDDO + !$ACC WAIT(1) + CALL GSTATS(1602,1) - !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF - ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) - ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) - IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) - CALL GSTATS(1601,0) - IF(PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & - & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) - ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ISEND_WSET_SIZE_V - JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & - & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) - ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN - IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + CALL GSTATS(411,0) + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& + & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + ENDIF + !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) + + IR=0 + + !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS) + ! Receive loop......................................................... + DO INR=1,IRECV_COUNTS + IR=IR+1 + IPROC=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & + & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + ENDDO + + !....Send loop......................................................... + DO INS=1,ISEND_COUNTS + IR=IR+1 + ISEND=ISEND_TO_PROC(INS) + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) + ENDDO + !$ACC END HOST_DATA + + ! Copy local contribution + IF(ISENDTOT(MYPROC) > 0 )THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) + ELSE + IFLDA(IFLDS) = JFLD ENDIF - ENDDO + ENDIF ENDDO + + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + + ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW) + ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW) + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC) + CALL GSTATS(1601,0) + IF(PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) + ENDDO + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ISEND_WSET_SIZE_V + JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) + PBOUND=UBOUND(PGPUV,2) + PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) + PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) + PBOUND=UBOUND(PGP3A,2) + PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN + IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) + PBOUND=UBOUND(PGP3B,2) + PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ENDIF + ENDDO + ENDDO + ENDIF + CALL GSTATS(1601,1) + + !$ACC END DATA + ENDIF - CALL GSTATS(1601,1) - - !$ACC END DATA - - ENDIF - - - IF(IR > 0) THEN - CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES') - ENDIF - IF (LSYNC_TRANS) THEN - CALL GSTATS(431,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(431,1) - ENDIF - CALL GSTATS(411,1) - - ! Unpack loop......................................................... - - CALL GSTATS(1603,0) - DO INR=1,IRECV_COUNTS - IPROC=IRECV_TO_PROC(INR) - ILEN = IRECVTOT(IPROC)/KF_FS - IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(IPROC) - ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ILEN - IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & - & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - PREEL_REAL(IPOS) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) - ENDDO - ENDDO - ENDDO - !$ACC WAIT(1) - CALL GSTATS(1603,1) - !$ACC END DATA ! ZCOMBUFR - !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES - !$ACC END DATA !ZCOMBUFS (present) - !$ACC END DATA !PGP3B - !$ACC END DATA !PGP3A - !$ACC END DATA !PGP2 - !$ACC END DATA !PGPUV - !$ACC END DATA !PGP + IF(IR > 0) THEN + CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & + & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES') + ENDIF + IF (LSYNC_TRANS) THEN + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) + ENDIF + CALL GSTATS(411,1) - IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) + ! Unpack loop......................................................... + CALL GSTATS(1603,0) + DO INR=1,IRECV_COUNTS + IPROC=IRECV_TO_PROC(INR) + ILEN = IRECVTOT(IPROC)/KF_FS + IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(IPROC) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,ILEN + IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & + & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 + PREEL_REAL(IPOS) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN) + ENDDO + ENDDO + ENDDO + !$ACC WAIT(1) + CALL GSTATS(1603,1) + + !$ACC END DATA ! ZCOMBUFR + !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES + !$ACC END DATA !ZCOMBUFS (present) + !$ACC END DATA !PGP3B + !$ACC END DATA !PGP3A + !$ACC END DATA !PGP2 + !$ACC END DATA !PGPUV + !$ACC END DATA !PGP + + IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL - - END MODULE TRGTOL_MOD +END MODULE TRGTOL_MOD diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 3fd5944fe..90ea26e40 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -19,7 +19,6 @@ MODULE TRLTOG_MOD TYPE TRLTOG_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS END TYPE - CONTAINS FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT @@ -40,702 +39,700 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) END FUNCTION + SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& - & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) - - !**** *trltog * - transposition of grid point data from latitudinal - ! to column structure. This takes place between inverse - ! FFT and grid point calculations. - ! TRLTOG is the inverse of TRGTOL - - ! Version using CUDA-aware MPI - - ! Purpose. - ! -------- - - - !** Interface. - ! ---------- - ! *call* *trltog(...) - - ! Explicit arguments : - ! -------------------- - ! PREEL_REAL - Latitudinal data ready for direct FFT (input) - ! PGP - Blocked grid point data (output) - ! KVSET - "v-set" for each field (input) - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! D.Dent : 97-08-04 Reorganisation to allow NPRTRV - ! to differ from NPRGPEW - ! =99-03-29= Mats Hamrud and Deborah Salmond - ! JUMP in FFT's changed to 1 - ! INDEX introduced and ZCOMBUF not used for same PE - ! 01-11-23 Deborah Salmond and John Hague - ! LIMP_NOOLAP Option for non-overlapping message passing - ! and buffer packing - ! 01-12-18 Peter Towers - ! Improved vector performance of LTOG_PACK,LTOG_UNPACK - ! 03-0-02 G. Radnoti: Call barrier always when nproc>1 - ! 08-01-01 G.Mozdzynski: cleanup - ! 09-01-02 G.Mozdzynski: use non-blocking recv and send - ! ------------------------------------------------------------------ - - - - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT - USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER - USE TPM_GEN ,ONLY : LSYNC_TRANS - USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS - USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV - USE PE2SET_MOD ,ONLY : PE2SET - USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML - USE OML_MOD ,ONLY : OML_MY_THREAD - USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS - USE MPI - USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - - USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA + & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& + & PGP,PGPUV,PGP3A,PGP3B,PGP2) + + !**** *trltog * - transposition of grid point data from latitudinal + ! to column structure. This takes place between inverse + ! FFT and grid point calculations. + ! TRLTOG is the inverse of TRGTOL + + ! Version using CUDA-aware MPI + + ! Purpose. + ! -------- + + + !** Interface. + ! ---------- + ! *call* *trltog(...) + + ! Explicit arguments : + ! -------------------- + ! PREEL_REAL - Latitudinal data ready for direct FFT (input) + ! PGP - Blocked grid point data (output) + ! KVSET - "v-set" for each field (input) + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! D.Dent : 97-08-04 Reorganisation to allow NPRTRV + ! to differ from NPRGPEW + ! =99-03-29= Mats Hamrud and Deborah Salmond + ! JUMP in FFT's changed to 1 + ! INDEX introduced and ZCOMBUF not used for same PE + ! 01-11-23 Deborah Salmond and John Hague + ! LIMP_NOOLAP Option for non-overlapping message passing + ! and buffer packing + ! 01-12-18 Peter Towers + ! Improved vector performance of LTOG_PACK,LTOG_UNPACK + ! 03-0-02 G. Radnoti: Call barrier always when nproc>1 + ! 08-01-01 G.Mozdzynski: cleanup + ! 09-01-02 G.Mozdzynski: use non-blocking recv and send + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS + USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV + USE PE2SET_MOD ,ONLY : PE2SET + USE MPL_DATA_MODULE ,ONLY : MPL_COMM_OML + USE OML_MOD ,ONLY : OML_MY_THREAD + USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA - IMPLICIT NONE + IMPLICIT NONE - REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) - INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP - INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP(:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGPUV(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) - REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) - INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) - - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOG_HANDLE) :: HTRLTOG - - ! LOCAL VARIABLES - - REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) - INTEGER(KIND=JPIM) :: IREQ (NPROC*2) - INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) - INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) - - INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS - INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& - &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & - &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & - &JBLK, ILAT_STRIP - - ! Contains FIELD, PARS, LEVS - INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) - INTEGER(KIND=JPIM), PARAMETER :: IGP_OFFSETS_UV=1, IGP_OFFSETS_GP2=2, IGP_OFFSETS_GP3A=3, IGP_OFFSETS_GP3B=4 - INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF - - INTEGER(KIND=JPIM) :: IFLDA(KF_GP) - INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V - INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V - INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V - INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V - - INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) - INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) - INTEGER(KIND=JPIM) :: IVSET(KF_GP) - INTEGER(KIND=JPIM) :: J3,IFGP2,IFGP3A,IFGP3B - - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - - #ifdef PARKINDTRANS_SINGLE - #define TRLTOG_DTYPE MPI_REAL - #else - #define TRLTOG_DTYPE MPI_DOUBLE_PRECISION - #endif - - - ! ------------------------------------------------------------------ - - !* 0. Some initializations - ! -------------------- - IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) - - ! Note we have either - ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or - ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) - ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which - ! should match PSPXXX and PGPXXX arrays) - - - ! We first get the decomposition individually - IVSETUV(:) = -1 - IF (PRESENT(KVSETUV)) IVSETUV(:) = KVSETUV(:) - IVSETSC(:)=-1 - IF (PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) - ELSE - IOFF=0 - IF (PRESENT(KVSETSC2)) THEN - IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC2))=KVSETSC2(:) - IOFF = IOFF+SIZE(KVSETSC2) - ENDIF - IF (PRESENT(KVSETSC3A)) THEN - DO J3=1,MERGE(UBOUND(PGP3A,3),UBOUND(PGP3A,3)/3,.NOT. LSCDERS) - IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) - IOFF=IOFF+SIZE(KVSETSC3A) - ENDDO - ENDIF - IF (PRESENT(KVSETSC3B)) THEN - ! If SCDERS is on, the size of PGP is 3X larger because it is - ! holding various derivatives. The problem is that those are - ! at different non-contiguous positions, hence we treat them - ! as separate fields - DO J3=1,MERGE(UBOUND(PGP3B,3),UBOUND(PGP3B,3)/3,.NOT. LSCDERS) - IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) - IOFF=IOFF+SIZE(KVSETSC3B) - ENDDO - ENDIF - IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN - PRINT*, "TRLTOG: ERROR IN IVSETSC COMPUTATION" - STOP 39 + REAL(KIND=JPRBT), INTENT(INOUT), POINTER :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP + INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G, KF_SCALARS_G + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP(:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGPUV(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3A(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP3B(:,:,:,:) + REAL(KIND=JPRB),OPTIONAL,INTENT(OUT) :: PGP2(:,:,:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) + INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOG_HANDLE) :: HTRLTOG + + ! LOCAL VARIABLES + + REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) + + INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: IREQ (NPROC*2) + INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) + INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) + + INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS + INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& + &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & + &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & + &JBLK, ILAT_STRIP + + ! Contains FIELD, PARS, LEVS + INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) + INTEGER(KIND=JPIM), PARAMETER :: IGP_OFFSETS_UV=1, IGP_OFFSETS_GP2=2, IGP_OFFSETS_GP3A=3, IGP_OFFSETS_GP3B=4 + INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF + + INTEGER(KIND=JPIM) :: IFLDA(KF_GP) + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V + INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V + INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V + INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + + INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) + INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) + INTEGER(KIND=JPIM) :: IVSET(KF_GP) + INTEGER(KIND=JPIM) :: J3,IFGP2,IFGP3A,IFGP3B + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR + + #ifdef PARKINDTRANS_SINGLE + #define TRLTOG_DTYPE MPI_REAL + #else + #define TRLTOG_DTYPE MPI_DOUBLE_PRECISION + #endif + + + ! ------------------------------------------------------------------ + + !* 0. Some initializations + ! -------------------- + IF (LHOOK) CALL DR_HOOK('TRLTOG',0,ZHOOK_HANDLE) + + ! Note we have either + ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or + ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2) + ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which + ! should match PSPXXX and PGPXXX arrays) + + + ! We first get the decomposition individually + IVSETUV(:) = -1 + IF (PRESENT(KVSETUV)) IVSETUV(:) = KVSETUV(:) + IVSETSC(:)=-1 + IF (PRESENT(KVSETSC)) THEN + IVSETSC(:) = KVSETSC(:) + ELSE + IOFF=0 + IF (PRESENT(KVSETSC2)) THEN + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC2))=KVSETSC2(:) + IOFF = IOFF+SIZE(KVSETSC2) + ENDIF + IF (PRESENT(KVSETSC3A)) THEN + DO J3=1,MERGE(UBOUND(PGP3A,3),UBOUND(PGP3A,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:) + IOFF=IOFF+SIZE(KVSETSC3A) + ENDDO + ENDIF + IF (PRESENT(KVSETSC3B)) THEN + ! If SCDERS is on, the size of PGP is 3X larger because it is + ! holding various derivatives. The problem is that those are + ! at different non-contiguous positions, hence we treat them + ! as separate fields + DO J3=1,MERGE(UBOUND(PGP3B,3),UBOUND(PGP3B,3)/3,.NOT. LSCDERS) + IVSETSC(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:) + IOFF=IOFF+SIZE(KVSETSC3B) + ENDDO + ENDIF + IF (IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN + PRINT*, "TRLTOG: ERROR IN IVSETSC COMPUTATION" + STOP 39 + ENDIF ENDIF - ENDIF - ! Now from UV and Scalars decomposition we get the full decomposition - IOFF=0 - IF (KF_UV_G > 0) THEN - IF (LVORGP) THEN + ! Now from UV and Scalars decomposition we get the full decomposition + IOFF=0 + IF (KF_UV_G > 0) THEN + IF (LVORGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF + IF ( LDIVGP) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + ENDIF IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) IOFF=IOFF+KF_UV_G - ENDIF - IF ( LDIVGP) THEN IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) IOFF=IOFF+KF_UV_G ENDIF - IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) - IOFF=IOFF+KF_UV_G - ENDIF - IF (KF_SCALARS_G > 0) THEN - IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) - IOFF=IOFF+KF_SCALARS_G - IF (LSCDERS) THEN + IF (KF_SCALARS_G > 0) THEN IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) IOFF=IOFF+KF_SCALARS_G + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF ENDIF - ENDIF - IF (KF_UV_G > 0 .AND. LUVDER) THEN - IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) - IOFF=IOFF+KF_UV_G - IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) - IOFF=IOFF+KF_UV_G - ENDIF - IF (KF_SCALARS_G > 0) THEN - IF (LSCDERS) THEN - IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) - IOFF=IOFF+KF_SCALARS_G - ENDIF - ENDIF - - IF (.NOT. PRESENT(PGP)) THEN - ! This is only relevant if we use the split interface (i.e. not PGP) - - IGP2PAR = 0 - IGP3APAR = 0 - IGP3ALEV = 0 - IGP3BPAR = 0 - IGP3BLEV = 0 - IF (PRESENT(PGP2)) THEN - IGP2PAR = UBOUND(PGP2,2) - IF(LSCDERS) IGP2PAR = IGP2PAR/3 - ENDIF - IF (PRESENT(PGP3A)) THEN - IGP3ALEV = UBOUND(PGP3A,2) - IGP3APAR = UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR = IGP3APAR/3 - ENDIF - IF (PRESENT(PGP3B)) THEN - IGP3BLEV = UBOUND(PGP3B,2) - IGP3BPAR = UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR = IGP3BPAR/3 + IF (KF_UV_G > 0 .AND. LUVDER) THEN + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G + IVSET(IOFF+1:IOFF+KF_UV_G) = IVSETUV(:) + IOFF=IOFF+KF_UV_G ENDIF - IF (IGP2PAR + IGP3ALEV*IGP3APAR + IGP3BPAR*IGP3BLEV /= KF_SCALARS_G) THEN - PRINT *, IGP2PAR, IGP3APAR, IGP3ALEV, IGP3BPAR, IGP3BLEV - CALL ABORT_TRANS("INCONSISTENCY IN SCALARS") + IF (KF_SCALARS_G > 0) THEN + IF (LSCDERS) THEN + IVSET(IOFF+1:IOFF+KF_SCALARS_G) = IVSETSC(:) + IOFF=IOFF+KF_SCALARS_G + ENDIF ENDIF - ! This is only relevant if we use the split interface (i.e. not PGP) - IUVPAR = 1 - IOFF=1 - IF(LVORGP) THEN + IF (.NOT. PRESENT(PGP)) THEN + ! This is only relevant if we use the split interface (i.e. not PGP) + + IGP2PAR = 0 + IGP3APAR = 0 + IGP3ALEV = 0 + IGP3BPAR = 0 + IGP3BLEV = 0 + IF (PRESENT(PGP2)) THEN + IGP2PAR = UBOUND(PGP2,2) + IF(LSCDERS) IGP2PAR = IGP2PAR/3 + ENDIF + IF (PRESENT(PGP3A)) THEN + IGP3ALEV = UBOUND(PGP3A,2) + IGP3APAR = UBOUND(PGP3A,3) + IF(LSCDERS) IGP3APAR = IGP3APAR/3 + ENDIF + IF (PRESENT(PGP3B)) THEN + IGP3BLEV = UBOUND(PGP3B,2) + IGP3BPAR = UBOUND(PGP3B,3) + IF(LSCDERS) IGP3BPAR = IGP3BPAR/3 + ENDIF + IF (IGP2PAR + IGP3ALEV*IGP3APAR + IGP3BPAR*IGP3BLEV /= KF_SCALARS_G) THEN + PRINT *, IGP2PAR, IGP3APAR, IGP3ALEV, IGP3BPAR, IGP3BLEV + CALL ABORT_TRANS("INCONSISTENCY IN SCALARS") + ENDIF + + ! This is only relevant if we use the split interface (i.e. not PGP) + IUVPAR = 1 + IOFF=1 + IF(LVORGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + IF(LDIVGP) THEN + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + ! U IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) IUVPAR=IUVPAR+1 IOFF=IOFF+KF_UV_G - ENDIF - IF(LDIVGP) THEN + ! V IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) IUVPAR=IUVPAR+1 IOFF=IOFF+KF_UV_G - ENDIF - ! U - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) - IUVPAR=IUVPAR+1 - IOFF=IOFF+KF_UV_G - - ! V - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) - IUVPAR=IUVPAR+1 - IOFF=IOFF+KF_UV_G - - ! Scalars - ! PGP2 - IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 - IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J, J=1,IGP2PAR)/) - IOFF=IOFF+IGP2PAR - ! PGP3A - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) - IOFF=IOFF+IGP3APAR*IGP3ALEV - ! PGP3B - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) - IOFF=IOFF+IGP3BPAR*IGP3BLEV - - IF(LSCDERS) THEN - !Scalars NS Derivatives + ! Scalars ! PGP2 IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 - IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+IGP2PAR, J=1,IGP2PAR)/) + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J, J=1,IGP2PAR)/) IOFF=IOFF+IGP2PAR ! PGP3A IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) IOFF=IOFF+IGP3APAR*IGP3ALEV ! PGP3B IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) IOFF=IOFF+IGP3BPAR*IGP3BLEV - ENDIF - IF(LUVDER) THEN - ! U Derivative NS - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) - IUVPAR=IUVPAR+1 - IOFF=IOFF+KF_UV_G + IF(LSCDERS) THEN + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV + ENDIF - ! V Derivative NS - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR - IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) - IUVPAR=IUVPAR+1 - IOFF=IOFF+KF_UV_G + IF(LUVDER) THEN + ! U Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + + ! V Derivative NS + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,1) = IGP_OFFSETS_UV + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,2) = IUVPAR + IGP_OFFSETS(IOFF:IOFF+KF_UV_G-1,3) = (/(J, J=1,KF_UV_G)/) + IUVPAR=IUVPAR+1 + IOFF=IOFF+KF_UV_G + ENDIF + + IF(LSCDERS) THEN + !Scalars NS Derivatives + ! PGP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 + IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+2*IGP2PAR, J=1,IGP2PAR)/) + IOFF=IOFF+IGP2PAR + ! PGP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+2*IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) + IOFF=IOFF+IGP3APAR*IGP3ALEV + ! PGP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+2*IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) + IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) + IOFF=IOFF+IGP3BPAR*IGP3BLEV + ENDIF ENDIF - IF(LSCDERS) THEN - !Scalars NS Derivatives - ! PGP2 - IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,1) = IGP_OFFSETS_GP2 - IGP_OFFSETS(IOFF:IOFF+IGP2PAR-1,2) = (/(J+2*IGP2PAR, J=1,IGP2PAR)/) - IOFF=IOFF+IGP2PAR - ! PGP3A - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,1) = IGP_OFFSETS_GP3A - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,2) = (/(1+2*IGP3APAR+J/IGP3ALEV, J=0,IGP3APAR*IGP3ALEV-1)/) - IGP_OFFSETS(IOFF:IOFF+IGP3APAR*IGP3ALEV-1,3) = (/(1+MOD(J,IGP3ALEV), J=0,IGP3APAR*IGP3ALEV-1)/) - IOFF=IOFF+IGP3APAR*IGP3ALEV - ! PGP3B - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,1) = IGP_OFFSETS_GP3B - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,2) = (/(1+2*IGP3BPAR+J/IGP3BLEV, J=0,IGP3BPAR*IGP3BLEV-1)/) - IGP_OFFSETS(IOFF:IOFF+IGP3BPAR*IGP3BLEV-1,3) = (/(1+MOD(J,IGP3BLEV), J=0,IGP3BPAR*IGP3BLEV-1)/) - IOFF=IOFF+IGP3BPAR*IGP3BLEV + CALL GSTATS(1806,0) + + ! Prepare receiver arrays + ! find number of fields on a certain V-set + IF(NPRTRV == 1) THEN + ! This is needed because KVSET(JFLD) == -1 if there is only one V-set + IRECV_FIELD_COUNT(1) = KF_GP + ELSE + IRECV_FIELD_COUNT(:) = 0 + DO JFLD=1,KF_GP + IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1 + ENDDO ENDIF - ENDIF - - CALL GSTATS(1806,0) - - ! Prepare receiver arrays - ! find number of fields on a certain V-set - IF(NPRTRV == 1) THEN - ! This is needed because KVSET(JFLD) == -1 if there is only one V-set - IRECV_FIELD_COUNT(1) = KF_GP - ELSE - IRECV_FIELD_COUNT(:) = 0 - DO JFLD=1,KF_GP - IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1 + ! find number of grid-points on a certain W-set that overlap with myself + IRECV_WSET_SIZE(:) = 0 + DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) + ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 + IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & + & IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) ENDDO - ENDIF - ! find number of grid-points on a certain W-set that overlap with myself - IRECV_WSET_SIZE(:) = 0 - DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) - ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1 - IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = & - & IRECV_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW) - ENDDO - ! sum up offsets - IRECV_WSET_OFFSET(1) = 0 - DO JROC=1,NPRTRW - IRECV_WSET_OFFSET(JROC+1)=IRECV_WSET_OFFSET(JROC)+IRECV_WSET_SIZE(JROC) - ENDDO - DO JROC=1,NPROC - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - ! total recv size is # points per field * # fields - IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) - ENDDO - - ! Prepare sender arrays - IIN_TO_SEND_BUFR_OFFSET(1) = 0 - DO JROC=1,NPROC - ! Get new offset to my current KINDEX entry - IF (JROC > 1 .AND. KF_FS > 0) THEN - IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)+ISENDTOT(JROC-1)/KF_FS - ELSEIF (JROC > 1) THEN - IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1) - ENDIF + ! sum up offsets + IRECV_WSET_OFFSET(1) = 0 + DO JROC=1,NPRTRW + IRECV_WSET_OFFSET(JROC+1)=IRECV_WSET_OFFSET(JROC)+IRECV_WSET_SIZE(JROC) + ENDDO + DO JROC=1,NPROC + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + ! total recv size is # points per field * # fields + IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) + ENDDO + + ! Prepare sender arrays + IIN_TO_SEND_BUFR_OFFSET(1) = 0 + DO JROC=1,NPROC + ! Get new offset to my current KINDEX entry + IF (JROC > 1 .AND. KF_FS > 0) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1)+ISENDTOT(JROC-1)/KF_FS + ELSEIF (JROC > 1) THEN + IIN_TO_SEND_BUFR_OFFSET(JROC) = IIN_TO_SEND_BUFR_OFFSET(JROC-1) + ENDIF - CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) - - ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) - ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver - ! (me, the W-set). Ideally those conincide, at least mostly. - IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) - ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) - ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) - - IPOS = 0 - DO JGL=IFIRSTLAT,ILASTLAT - ! get from "actual" latitude to the latitude strip offset - IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) - ! get from "actual" latitude to the latitude offset - IGLL = JGL-D%NPTRLS(MYSETW)+1 - DO JL=1,D%NONL(IGL,ISETB) - IPOS = IPOS+1 - ! offset to first layer of this gridpoint - IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & - & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) - ! distance between two layers of this gridpoint - IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & - & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) + + ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set) + ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver + ! (me, the W-set). Ideally those conincide, at least mostly. + IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA)) + ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set) + ILASTLAT = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA)) + + IPOS = 0 + DO JGL=IFIRSTLAT,ILASTLAT + ! get from "actual" latitude to the latitude strip offset + IGL = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA) + ! get from "actual" latitude to the latitude offset + IGLL = JGL-D%NPTRLS(MYSETW)+1 + DO JL=1,D%NONL(IGL,ISETB) + IPOS = IPOS+1 + ! offset to first layer of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & + & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + ! distance between two layers of this gridpoint + IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & + & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) + ENDDO ENDDO + !we always receive the full fourier space + ISENDTOT(JROC) = IPOS*KF_FS ENDDO - !we always receive the full fourier space - ISENDTOT(JROC) = IPOS*KF_FS - ENDDO - - !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) - - !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) - - ! Present until self contribution and packing are done - !$ACC DATA PRESENT(PREEL_REAL) - CALL GSTATS(1806,1) - - ! Copy local contribution - IF(ISENDTOT(MYPROC) > 0) THEN - ! I have to send something to myself... - - ! Input is KF_GP fields. We find the resulting KF_FS fields. - IFLDS = 0 - DO JFLD=1,KF_GP - IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS) = KPTRGP(JFLD) - ELSE - IFLDA(IFLDS) = JFLD + + !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) + + !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) + + ! Present until self contribution and packing are done + !$ACC DATA PRESENT(PREEL_REAL) + CALL GSTATS(1806,1) + + ! Copy local contribution + IF(ISENDTOT(MYPROC) > 0) THEN + ! I have to send something to myself... + + ! Input is KF_GP fields. We find the resulting KF_FS fields. + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS) = KPTRGP(JFLD) + ELSE + IFLDA(IFLDS) = JFLD + ENDIF + ENDIF + ENDDO + + !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + + CALL GSTATS(1604,0) + + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) + IF (PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 + PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) + ENDDO + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) + DO JFLD=1,KF_FS + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD = IFLDA(JFLD) + IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & + & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + ENDIF + ENDDO + ENDDO + ENDIF + CALL GSTATS(1604,1) + + !$ACC END DATA + + ENDIF + + ! Figure out processes that send or recv something + ISEND_COUNTS = 0 + IRECV_COUNTS = 0 + DO JROC=1,NPROC + IF( JROC /= MYPROC) THEN + IF(IRECVTOT(JROC) > 0) THEN + ! I have to recv something, so let me store that + IRECV_COUNTS = IRECV_COUNTS + 1 + IRECV_TO_PROC(IRECV_COUNTS)=JROC + ENDIF + IF(ISENDTOT(JROC) > 0) THEN + ! I have to send something, so let me store that + ISEND_COUNTS = ISEND_COUNTS+1 + ISEND_TO_PROC(ISEND_COUNTS)=JROC ENDIF ENDIF ENDDO - !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1) + ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) + ICOMBUFS_OFFSET(1) = 0 + DO JROC=1,ISEND_COUNTS + ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) + ENDDO + ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) + ICOMBUFR_OFFSET(1) = 0 + DO JROC=1,IRECV_COUNTS + ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) + ENDDO - CALL GSTATS(1604,0) + IF (IRECV_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) + ENDIF + IF (ISEND_COUNTS > 0) THEN + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& + & ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1, & + & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) + ENDIF - IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW) - IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) - IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) - IF (PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,IRECV_WSET_SIZE_V - JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) - IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & - & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 - PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) - ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) ASYNC(1) + !$ACC DATA PRESENT(ZCOMBUFS) + CALL GSTATS(1605,0) + DO INS=1,ISEND_COUNTS + IPROC = ISEND_TO_PROC(INS) + ILEN = ISENDTOT(IPROC)/KF_FS + IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) + ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) COLLAPSE(2) ASYNC(1) DO JFLD=1,KF_FS - DO JL=1,IRECV_WSET_SIZE_V - JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD = IFLDA(JFLD) + DO JL=1,ILEN IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 - IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN - PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN - PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN - PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN - PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) - ENDIF + ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) ENDDO ENDDO - ENDIF - CALL GSTATS(1604,1) + ENDDO + CALL GSTATS(1605,1) + !$ACC END DATA ! ZCOMBUFS - !$ACC END DATA + !$ACC END DATA ! PREEL_REAL - ENDIF + !$ACC WAIT(1) - ! Figure out processes that send or recv something - ISEND_COUNTS = 0 - IRECV_COUNTS = 0 - DO JROC=1,NPROC - IF( JROC /= MYPROC) THEN - IF(IRECVTOT(JROC) > 0) THEN - ! I have to recv something, so let me store that - IRECV_COUNTS = IRECV_COUNTS + 1 - IRECV_TO_PROC(IRECV_COUNTS)=JROC - ENDIF - IF(ISENDTOT(JROC) > 0) THEN - ! I have to send something, so let me store that - ISEND_COUNTS = ISEND_COUNTS+1 - ISEND_TO_PROC(ISEND_COUNTS)=JROC - ENDIF + CALL GSTATS(805,0) + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) ENDIF - ENDDO - - ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1)) - ICOMBUFS_OFFSET(1) = 0 - DO JROC=1,ISEND_COUNTS - ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC)) - ENDDO - ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1)) - ICOMBUFR_OFFSET(1) = 0 - DO JROC=1,IRECV_COUNTS - ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC)) - ENDDO - - IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & 1_C_SIZE_T, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) - ENDIF - IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1, & - & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) - ENDIF - - !$ACC DATA PRESENT(ZCOMBUFS) - CALL GSTATS(1605,0) - DO INS=1,ISEND_COUNTS - IPROC = ISEND_TO_PROC(INS) - ILEN = ISENDTOT(IPROC)/KF_FS - IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) - ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) COLLAPSE(2) ASYNC(1) - DO JFLD=1,KF_FS - DO JL=1,ILEN - IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & - & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 - ZCOMBUFS(ICOMBUFS_OFFSET_V+(JFLD-1)*ILEN+JL) = PREEL_REAL(IPOS) - ENDDO + CALL GSTATS(421,0) + + IR=0 + !...Receive loop......................................................... + !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) + DO INR=1,IRECV_COUNTS + IR=IR+1 + IRECV=IRECV_TO_PROC(INR) + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & + & IRECVTOT(IRECV), & + & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & + & MTAGLG, MPL_COMM_OML(OML_MY_THREAD()), IREQ(IR), & + & IERROR ) ENDDO - ENDDO - CALL GSTATS(1605,1) - !$ACC END DATA ! ZCOMBUFS - - !$ACC END DATA ! PREEL_REAL - - !$ACC WAIT(1) - - CALL GSTATS(805,0) - - IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(421,0) - - IR=0 - !...Receive loop......................................................... - !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) - DO INR=1,IRECV_COUNTS - IR=IR+1 - IRECV=IRECV_TO_PROC(INR) - CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & - & IRECVTOT(IRECV), & - & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & - & MTAGLG, MPL_COMM_OML(OML_MY_THREAD()), IREQ(IR), & - & IERROR ) - ENDDO - - !...Send loop......................................................... - DO INS=1,ISEND_COUNTS - IR=IR+1 - ISEND=ISEND_TO_PROC(INS) - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & - & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) - ENDDO - !$ACC END HOST_DATA - - IF(IR > 0) THEN - CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRLTOG_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') - ENDIF - - IF (LSYNC_TRANS) THEN - CALL GSTATS(441,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(441,1) - ENDIF - CALL GSTATS(421,1) - - !$ACC DATA PRESENT(ZCOMBUFR) - CALL GSTATS(805,1) - - ! Unpack loop......................................................... - - CALL GSTATS(1606,0) - DO INR=1,IRECV_COUNTS - IRECV=IRECV_TO_PROC(INR) - CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) - - IRECV_FIELD_COUNT_V = IRECV_FIELD_COUNT(ISETV) - ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) - - IFLDS = 0 - DO JFLD=1,KF_GP - IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN - IFLDS = IFLDS+1 - IF(PRESENT(KPTRGP)) THEN - IFLDA(IFLDS)=KPTRGP(JFLD) - ELSE - IFLDA(IFLDS)=JFLD - ENDIF - ENDIF + + !...Send loop......................................................... + DO INS=1,ISEND_COUNTS + IR=IR+1 + ISEND=ISEND_TO_PROC(INS) + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,MPL_COMM_OML(OML_MY_THREAD()),IREQ(IR),IERROR) ENDDO + !$ACC END HOST_DATA - !$ACC DATA COPYIN(IFLDA(1:IRECV_FIELD_COUNT_V)) ASYNC(1) - - IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW) - IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) - IF (PRESENT(PGP)) THEN - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JFLD=1,IRECV_FIELD_COUNT_V - DO JL=1,IRECV_WSET_SIZE_V - JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD=IFLDA(JFLD) - JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL - PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI) - ENDDO - ENDDO - ELSE - !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) - DO JFLD=1,IRECV_FIELD_COUNT_V - DO JL=1,IRECV_WSET_SIZE_V - JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 - JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 - IFLD=IFLDA(JFLD) - JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL - IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN - PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN - PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN - PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) - ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN - PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + IF(IR > 0) THEN + CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & + & CDSTRING='TRLTOG_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') + ENDIF + + IF (LSYNC_TRANS) THEN + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) + ENDIF + CALL GSTATS(421,1) + + !$ACC DATA PRESENT(ZCOMBUFR) + CALL GSTATS(805,1) + + ! Unpack loop......................................................... + + CALL GSTATS(1606,0) + DO INR=1,IRECV_COUNTS + IRECV=IRECV_TO_PROC(INR) + CALL PE2SET(IRECV,ISETA,ISETB,ISETW,ISETV) + + IRECV_FIELD_COUNT_V = IRECV_FIELD_COUNT(ISETV) + ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR) + + IFLDS = 0 + DO JFLD=1,KF_GP + IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN + IFLDS = IFLDS+1 + IF(PRESENT(KPTRGP)) THEN + IFLDA(IFLDS)=KPTRGP(JFLD) + ELSE + IFLDA(IFLDS)=JFLD ENDIF - ENDDO + ENDIF ENDDO - ENDIF - !$ACC END DATA - ENDDO - !$ACC END DATA ! ZOMBUFR - IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(422,0) - !$ACC END DATA ! PGP3B - !$ACC END DATA ! PGP3A - !$ACC END DATA ! PGP2 - !$ACC END DATA ! PGPUV - !$ACC END DATA ! PGP - IF (LSYNC_TRANS) THEN - !$ACC WAIT(1) - CALL GSTATS(442,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(442,1) - ENDIF - CALL GSTATS(422,1) - !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES + !$ACC DATA COPYIN(IFLDA(1:IRECV_FIELD_COUNT_V)) ASYNC(1) + + IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW) + IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) + IF (PRESENT(PGP)) THEN + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI) + ENDDO + ENDDO + ELSE + !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) ASYNC(1) + DO JFLD=1,IRECV_FIELD_COUNT_V + DO JL=1,IRECV_WSET_SIZE_V + JK = MOD(IRECV_WSET_OFFSET_V+JL-1,NPROMA)+1 + JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 + IFLD=IFLDA(JFLD) + JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL + IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN + PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN + PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN + PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN + PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + ENDIF + ENDDO + ENDDO + ENDIF + !$ACC END DATA + ENDDO - !$ACC WAIT(1) + !$ACC END DATA ! ZOMBUFR + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(422,0) + !$ACC END DATA ! PGP3B + !$ACC END DATA ! PGP3A + !$ACC END DATA ! PGP2 + !$ACC END DATA ! PGPUV + !$ACC END DATA ! PGP + IF (LSYNC_TRANS) THEN + !$ACC WAIT(1) + CALL GSTATS(442,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(442,1) + ENDIF + CALL GSTATS(422,1) + !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES - CALL GSTATS(1606,1) + !$ACC WAIT(1) - IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) + CALL GSTATS(1606,1) + IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) END SUBROUTINE TRLTOG_CUDAAWARE - END MODULE TRLTOG_MOD - +END MODULE TRLTOG_MOD + diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index ae282ef17..011952959 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -18,7 +18,6 @@ MODULE TRLTOM_MOD TYPE TRLTOM_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF END TYPE - CONTAINS FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT @@ -34,157 +33,155 @@ FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, D%NLENGT1B*2*KF_FS*SIZEOF(DUMMY)) END FUNCTION -SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) - -!**** *TRLTOM * - transposition in Fourierspace - -! Purpose. -! -------- -! Transpose Fourier coefficients from partitioning -! over latitudes to partitioning over wave numbers -! This is done between inverse Legendre Transform -! and inverse FFT. -! This is the inverse routine of TRMTOL. - -!** Interface. -! ---------- -! *CALL* *TRLTOM(...)* - -! Explicit arguments : PFBUF - Fourier coefficient buffer. It is -! -------------------- used for both input and output. - -! KF_FS - Number of fields communicated - -! Implicit arguments : -! -------------------- - -! Method. -! ------- -! See documentation - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! MPP Group *ECMWF* - -! Modifications. -! -------------- -! Original : 95-10-01 -! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use -! (NCOMBFLEN) for nphase.eq.1 -! Modified : 99-05-28 D.Salmond - Optimise copies. -! Modified : 00-02-02 M.Hamrud - Remove NPHASE -! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message -! passing and buffer packing -! G.Mozdzynski: 08-01-01 Cleanup -! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK -USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE MPI -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -REAL(KIND=JPRBT) ,INTENT(OUT), POINTER :: PFBUF(:) -REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) - -INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) -INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -INTEGER(KIND=JPIM) :: IERROR - -INTEGER::IGROUP_MS,IGROUP_WORLD,IRANKS1(NPRTRW),IRANKS2(NPRTRW) -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM - -#ifdef PARKINDTRANS_SINGLE -#define TRLTOM_DTYPE MPI_REAL -#else -#define TRLTOM_DTYPE MPI_DOUBLE_PRECISION -#endif - -IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) - -CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& - & 1_C_SIZE_T, D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1))) - -!$ACC DATA PRESENT(PFBUF,PFBUF_IN) - -IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - - CALL GSTATS(806,0) - - ! copy to self workaround - IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) - IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN - PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) - stop 1 - ENDIF - IF (ILENS(IRANK) > 0) THEN - FROM_SEND = IOFFS(IRANK) + 1 - TO_SEND = FROM_SEND + ILENS(IRANK) - 1 - FROM_RECV = IOFFR(IRANK) + 1 - TO_RECV = FROM_RECV + ILENR(IRANK) - 1 - !$ACC KERNELS ASYNC(1) - PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) - !$ACC END KERNELS - ILENS(IRANK) = 0 - ILENR(IRANK) = 0 - ENDIF - - IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) - ENDIF - CALL GSTATS(411,0) - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& - & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & - & MPL_ALL_MS_COMM,IERROR) - !$ACC END HOST_DATA - IF (LSYNC_TRANS) THEN - CALL GSTATS(431,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(431,1) - ENDIF - CALL GSTATS(411,1) - - !$ACC WAIT(1) - CALL GSTATS(806,1) -ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 - CALL GSTATS(1607,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1607,1) -ENDIF - -!$ACC END DATA - -IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ -END SUBROUTINE TRLTOM_CUDAAWARE + SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) + !**** *TRLTOM * - transposition in Fourierspace + + ! Purpose. + ! -------- + ! Transpose Fourier coefficients from partitioning + ! over latitudes to partitioning over wave numbers + ! This is done between inverse Legendre Transform + ! and inverse FFT. + ! This is the inverse routine of TRMTOL. + + !** Interface. + ! ---------- + ! *CALL* *TRLTOM(...)* + + ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is + ! -------------------- used for both input and output. + + ! KF_FS - Number of fields communicated + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use + ! (NCOMBFLEN) for nphase.eq.1 + ! Modified : 99-05-28 D.Salmond - Optimise copies. + ! Modified : 00-02-02 M.Hamrud - Remove NPHASE + ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message + ! passing and buffer packing + ! G.Mozdzynski: 08-01-01 Cleanup + ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS + REAL(KIND=JPRBT) ,INTENT(OUT), POINTER :: PFBUF(:) + REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) + + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + INTEGER(KIND=JPIM) :: IERROR + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM + + #ifdef PARKINDTRANS_SINGLE + #define TRLTOM_DTYPE MPI_REAL + #else + #define TRLTOM_DTYPE MPI_DOUBLE_PRECISION + #endif + + IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& + & 1_C_SIZE_T, D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1))) + + !$ACC DATA PRESENT(PFBUF,PFBUF_IN) + + IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSGTB(J)*2*KF_FS + IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS + ILENR(J) = D%NLTSFTB(J)*2*KF_FS + IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ENDDO + + CALL GSTATS(806,0) + + ! copy to self workaround + IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF + IF (ILENS(IRANK) > 0) THEN + FROM_SEND = IOFFS(IRANK) + 1 + TO_SEND = FROM_SEND + ILENS(IRANK) - 1 + FROM_RECV = IOFFR(IRANK) + 1 + TO_RECV = FROM_RECV + ILENR(IRANK) - 1 + !$ACC KERNELS ASYNC(1) + PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) + !$ACC END KERNELS + ILENS(IRANK) = 0 + ILENR(IRANK) = 0 + ENDIF + + IF (LSYNC_TRANS) THEN + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + CALL GSTATS(411,0) + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& + & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & + & MPL_ALL_MS_COMM,IERROR) + !$ACC END HOST_DATA + IF (LSYNC_TRANS) THEN + CALL GSTATS(431,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(431,1) + ENDIF + CALL GSTATS(411,1) + + !$ACC WAIT(1) + CALL GSTATS(806,1) + ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_FS + ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + CALL GSTATS(1607,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1607,1) + ENDIF + + !$ACC END DATA + + IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ + END SUBROUTINE TRLTOM_CUDAAWARE END MODULE TRLTOM_MOD diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 6ba362a7b..b640e201c 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -18,7 +18,6 @@ MODULE TRMTOL_MOD TYPE TRMTOL_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF END TYPE - CONTAINS FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT @@ -34,153 +33,152 @@ FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, D%NLENGT0B*2*KF_LEG*SIZEOF(DUMMY)) END FUNCTION -SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) - -!**** *trmtol * - transposition in Fourier space - -! Purpose. -! -------- -! Transpose Fourier buffer data from partitioning -! over wave numbers to partitioning over latitudes. -! It is called between direct FFT and direct Legendre -! transform. -! This routine is the inverse of TRLTOM. - - -!** Interface. -! ---------- -! *call* *trmtol(...)* - -! Explicit arguments : PFBUF - Fourier coefficient buffer. It is -! -------------------- used for both input and output. -! KF_LEG - Number of fields communicated - -! Implicit arguments : -! -------------------- - -! Method. -! ------- -! See documentation - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! MPP Group *ECMWF* - -! Modifications. -! -------------- -! Original : 95-10-01 -! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use -! (NCOMBFLEN) for nphase.eq.1 -! Modified : 99-05-28 D.Salmond - Optimise copies. -! Modified : 00-02-02 M.Hamrud - Remove NPHASE -! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message -! passing and buffer packing -! G.Mozdzynski: 08-01-01 Cleanup -! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK -USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK -USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW -USE TPM_GEN ,ONLY : LSYNC_TRANS -USE MPI -USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_LEG -REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF(:) -REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) - -INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) -INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -INTEGER(KIND=JPIM) :: IERROR - -TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR -TYPE(TRMTOL_HANDLE), INTENT(IN) :: HTRMTOL - -#ifdef PARKINDTRANS_SINGLE -#define TRMTOL_DTYPE MPI_REAL -#else -#define TRMTOL_DTYPE MPI_DOUBLE_PRECISION -#endif - -IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) - -CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& - & 1_C_SIZE_T, D%NLENGT0B*2*KF_LEG*SIZEOF(PFBUF(1))) - -IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*2*KF_LEG - IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG - ILENR(J) = D%NLTSGTB(J)*2*KF_LEG - IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG - ENDDO - - CALL GSTATS(807,0) - - ! copy to self workaround - IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) - IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN - PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) - stop 1 - ENDIF - IF (ILENS(IRANK) > 0) THEN - FROM_SEND = IOFFS(IRANK) + 1 - TO_SEND = FROM_SEND + ILENS(IRANK) - 1 - FROM_RECV = IOFFR(IRANK) + 1 - TO_RECV = FROM_RECV + ILENR(IRANK) - 1 - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) - !$ACC END KERNELS - ILENS(IRANK) = 0 - ILENR(IRANK) = 0 - ENDIF - - IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(421,0) - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) - CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& - & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& - & MPL_ALL_MS_COMM,IERROR) - !$ACC END HOST_DATA - IF (LSYNC_TRANS) THEN - CALL GSTATS(441,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(441,1) - ENDIF - CALL GSTATS(421,1) - - !$ACC WAIT(1) - CALL GSTATS(807,1) -ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG - ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 - CALL GSTATS(1608,0) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) - ENDDO - CALL GSTATS(1608,1) -ENDIF - -IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) - -! ------------------------------------------------------------------ -END SUBROUTINE TRMTOL_CUDAAWARE + SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) + !**** *trmtol * - transposition in Fourier space + + ! Purpose. + ! -------- + ! Transpose Fourier buffer data from partitioning + ! over wave numbers to partitioning over latitudes. + ! It is called between direct FFT and direct Legendre + ! transform. + ! This routine is the inverse of TRLTOM. + + + !** Interface. + ! ---------- + ! *call* *trmtol(...)* + + ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is + ! -------------------- used for both input and output. + ! KF_LEG - Number of fields communicated + + ! Implicit arguments : + ! -------------------- + + ! Method. + ! ------- + ! See documentation + + ! Externals. + ! ---------- + + ! Reference. + ! ---------- + ! ECMWF Research Department documentation of the IFS + + ! Author. + ! ------- + ! MPP Group *ECMWF* + + ! Modifications. + ! -------------- + ! Original : 95-10-01 + ! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use + ! (NCOMBFLEN) for nphase.eq.1 + ! Modified : 99-05-28 D.Salmond - Optimise copies. + ! Modified : 00-02-02 M.Hamrud - Remove NPHASE + ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message + ! passing and buffer packing + ! G.Mozdzynski: 08-01-01 Cleanup + ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS + ! ------------------------------------------------------------------ + + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW + USE TPM_GEN ,ONLY : LSYNC_TRANS + USE MPI + USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + + IMPLICIT NONE + + INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_LEG + REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF(:) + REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) + + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + INTEGER(KIND=JPIM) :: IERROR + + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(TRMTOL_HANDLE), INTENT(IN) :: HTRMTOL + + #ifdef PARKINDTRANS_SINGLE + #define TRMTOL_DTYPE MPI_REAL + #else + #define TRMTOL_DTYPE MPI_DOUBLE_PRECISION + #endif + + IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) + + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& + & 1_C_SIZE_T, D%NLENGT0B*2*KF_LEG*SIZEOF(PFBUF(1))) + + IF(NPROC > 1) THEN + DO J=1,NPRTRW + ILENS(J) = D%NLTSFTB(J)*2*KF_LEG + IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG + ILENR(J) = D%NLTSGTB(J)*2*KF_LEG + IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG + ENDDO + + CALL GSTATS(807,0) + + ! copy to self workaround + IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) + IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN + PRINT *, "ERROR", ILENS(IRANK), ILENR(IRANK) + stop 1 + ENDIF + IF (ILENS(IRANK) > 0) THEN + FROM_SEND = IOFFS(IRANK) + 1 + TO_SEND = FROM_SEND + ILENS(IRANK) - 1 + FROM_RECV = IOFFR(IRANK) + 1 + TO_RECV = FROM_RECV + ILENR(IRANK) - 1 + !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) + !$ACC END KERNELS + ILENS(IRANK) = 0 + ILENR(IRANK) = 0 + ENDIF + + IF (LSYNC_TRANS) THEN + CALL GSTATS(440,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(440,1) + ENDIF + CALL GSTATS(421,0) + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& + & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& + & MPL_ALL_MS_COMM,IERROR) + !$ACC END HOST_DATA + IF (LSYNC_TRANS) THEN + CALL GSTATS(441,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(441,1) + ENDIF + CALL GSTATS(421,1) + + !$ACC WAIT(1) + CALL GSTATS(807,1) + ELSE + ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG + ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 + CALL GSTATS(1608,0) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) + DO J=ISTA,ISTA+ILEN-1 + PFBUF(J) = PFBUF_IN(J) + ENDDO + CALL GSTATS(1608,1) + ENDIF + + IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) + + ! ------------------------------------------------------------------ + END SUBROUTINE TRMTOL_CUDAAWARE END MODULE TRMTOL_MOD From de1ef2eb903b3ee0b03e64c7d25948fcfb4912e9 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 2 Jun 2022 05:30:04 -0700 Subject: [PATCH 238/263] Re-enable gpnorm after breaking in 'Add allocator / incomplete cleanup but working' --- src/trans/gpu/external/gpnorm_trans.F90 | 34 ++++++++++++------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index fa1e2faa4..f81a0f04e 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -62,11 +62,13 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEOMETRY ,ONLY : G,G_NLOEN,G_NLOEN_MAX USE TPM_FIELDS ,ONLY : F USE SET_RESOL_MOD ,ONLY : SET_RESOL -USE TRGTOL_MOD ,ONLY : TRGTOL_CUDAAWARE USE SET2PE_MOD ,ONLY : SET2PE USE MPL_MODULE ,ONLY : MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK +USE TRGTOL_MOD +USE TPM_TRANS, ONLY:REUSE_PTR +USE ALLOCATOR_MOD !endif INTERFACE @@ -93,7 +95,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETG(:,:) !GPU REAL(KIND=JPRBT) :: V -REAL(KIND=JPRBT),ALLOCATABLE,SAVE :: ZGTF(:) +REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) !REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) !REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) !REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) @@ -106,6 +108,8 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) REAL(KIND=JPRD),ALLOCATABLE :: ZRCV(:) INTEGER(KIND=JPIM) :: J,JGL,IGL,JL,JF,IF_GP,IF_SCALARS_G,IF_FS,JSETV,JSETW,IWLATS,JMAX INTEGER(KIND=JPIM) :: IPROC,ITAG,ILEN,ILENR,IBEG,IEND,IND +TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR +TYPE(TRGTOL_HANDLE) :: HTRGTOL !INTEGER(KIND=JPIM) :: iunit @@ -138,7 +142,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IF_GP=KFIELDS -IF_SCALARS_G=0 +IF_SCALARS_G=KFIELDS IF_FS=0 DO J=1,KFIELDS @@ -161,11 +165,6 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ZMAXGPN = 0._JPRBT !$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) -if (.not. allocated(zgtf)) then -ALLOCATE(ZGTF(IF_FS*D%NLENGTF)) -write(nout,*)'ZGTF :',size(ZGTF) -!$ACC ENTER DATA CREATE(ZGTF) -endif endif ALLOCATE(IVSETS(NPRTRV)) @@ -187,13 +186,16 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ! write(iunit,*) 'PGP field=',JF,PGP(1,JF,1),PGP(NPROMA,JF,1),PGP(1,JF,NGPBLKS) !ENDDO +ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() +HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,IF_GP,IF_FS) +CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) + ! done in setup_trans LGPNORM=.TRUE. -CALL TRGTOL_CUDAAWARE(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP) +CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,IF_FS,IF_GP,0,IF_SCALARS_G,& + & KVSETSC=IVSET,PGP=PGP) LGPNORM=.FALSE. -! ZGTF is now on GPU - IBEG=1 IEND=D%NDGL_FS @@ -203,11 +205,11 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC data & !$ACC& COPY(F,F%RW) & !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & - !$ACC& present(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$ACC& present(PREEL_REAL,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) !$ACC KERNELS DO JF=1,IF_FS - V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) + V = PREEL_REAL(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1))) ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT) ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT) ENDDO @@ -222,7 +224,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ZAVE(JF,JGL)=0.0_JPRB !$ACC loop DO JL=1,G_NLOEN(IGL) - V = ZGTF(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) + V = PREEL_REAL(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL) ZAVE(JF,JGL)=ZAVE(JF,JGL)+V ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),V) ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),V) @@ -481,10 +483,6 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ENDIF -!DEALLOCATE(ZGTF) -!DEALLOCATE(ZAVE) -!DEALLOCATE(ZMIN) -!DEALLOCATE(ZMAX) DEALLOCATE(ZAVEG) DEALLOCATE(ZMING) DEALLOCATE(ZMAXG) From 2d30553cb22db25a37da7300e035482b4e13f137 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:38 -0700 Subject: [PATCH 239/263] Remove adjoint fuctions (we should write them if needed) --- src/programs/CMakeLists.txt | 29 +- src/trans/gpu/CMakeLists.txt | 67 +- src/trans/gpu/external/dir_transad.F90 | 505 -------------- src/trans/gpu/external/inv_transad.F90 | 618 ------------------ src/trans/gpu/external/vordiv_to_uv.F90 | 178 ----- src/trans/gpu/internal/asre1ad_mod.F90 | 91 --- src/trans/gpu/internal/asre1bad_mod.F90 | 107 --- .../gpu/internal/dir_trans_ctlad_mod.F90 | 193 ------ src/trans/gpu/internal/fourier_inad_mod.F90 | 73 --- src/trans/gpu/internal/fourier_outad_mod.F90 | 72 -- src/trans/gpu/internal/fscad_mod.F90 | 145 ---- src/trans/gpu/internal/ftdir_ctlad_mod.F90 | 188 ------ src/trans/gpu/internal/ftdirad_mod.F90 | 119 ---- src/trans/gpu/internal/ftinv_ctlad_mod.F90 | 295 --------- src/trans/gpu/internal/ftinvad_mod.F90 | 124 ---- .../gpu/internal/inv_trans_ctlad_mod.F90 | 295 --------- src/trans/gpu/internal/ldfou2ad_mod.F90 | 96 --- src/trans/gpu/internal/ledirad_mod.F90 | 206 ------ src/trans/gpu/internal/leinvad_mod.F90 | 196 ------ src/trans/gpu/internal/ltdir_ctlad_mod.F90 | 110 ---- src/trans/gpu/internal/ltdirad_mod.F90 | 188 ------ src/trans/gpu/internal/ltinv_ctlad_mod.F90 | 119 ---- src/trans/gpu/internal/ltinvad_mod.F90 | 239 ------- src/trans/gpu/internal/prfi1ad_mod.F90 | 112 ---- src/trans/gpu/internal/prfi1bad_mod.F90 | 111 ---- src/trans/gpu/internal/prfi2ad_mod.F90 | 90 --- src/trans/gpu/internal/prfi2bad_mod.F90 | 98 --- src/trans/gpu/internal/spnsdead_mod.F90 | 118 ---- src/trans/gpu/internal/updspad_mod.F90 | 177 ----- src/trans/gpu/internal/updspbad_mod.F90 | 159 ----- src/trans/gpu/internal/uvtvdad_mod.F90 | 138 ---- src/trans/gpu/internal/vd2uv_ctl_mod.F90 | 80 --- src/trans/gpu/internal/vd2uv_mod.F90 | 156 ----- src/trans/gpu/internal/vdtuvad_mod.F90 | 144 ---- 34 files changed, 2 insertions(+), 5634 deletions(-) delete mode 100755 src/trans/gpu/external/dir_transad.F90 delete mode 100755 src/trans/gpu/external/inv_transad.F90 delete mode 100755 src/trans/gpu/external/vordiv_to_uv.F90 delete mode 100755 src/trans/gpu/internal/asre1ad_mod.F90 delete mode 100755 src/trans/gpu/internal/asre1bad_mod.F90 delete mode 100755 src/trans/gpu/internal/dir_trans_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/fourier_inad_mod.F90 delete mode 100755 src/trans/gpu/internal/fourier_outad_mod.F90 delete mode 100755 src/trans/gpu/internal/fscad_mod.F90 delete mode 100755 src/trans/gpu/internal/ftdir_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/ftdirad_mod.F90 delete mode 100755 src/trans/gpu/internal/ftinv_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/ftinvad_mod.F90 delete mode 100755 src/trans/gpu/internal/inv_trans_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/ldfou2ad_mod.F90 delete mode 100755 src/trans/gpu/internal/ledirad_mod.F90 delete mode 100755 src/trans/gpu/internal/leinvad_mod.F90 delete mode 100755 src/trans/gpu/internal/ltdir_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/ltdirad_mod.F90 delete mode 100755 src/trans/gpu/internal/ltinv_ctlad_mod.F90 delete mode 100755 src/trans/gpu/internal/ltinvad_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi1ad_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi1bad_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi2ad_mod.F90 delete mode 100755 src/trans/gpu/internal/prfi2bad_mod.F90 delete mode 100755 src/trans/gpu/internal/spnsdead_mod.F90 delete mode 100755 src/trans/gpu/internal/updspad_mod.F90 delete mode 100755 src/trans/gpu/internal/updspbad_mod.F90 delete mode 100755 src/trans/gpu/internal/uvtvdad_mod.F90 delete mode 100755 src/trans/gpu/internal/vd2uv_ctl_mod.F90 delete mode 100755 src/trans/gpu/internal/vd2uv_mod.F90 delete mode 100755 src/trans/gpu/internal/vdtuvad_mod.F90 diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index d4f232621..a4f2aa5f0 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -26,35 +26,16 @@ if( HAVE_TOOLS AND TARGET eccodes_f90 ) LIBS ${trans} eccodes_f90 LINKER_LANGUAGE Fortran DEFINITIONS ECTRANS_TOOLS_RTABLE_PATH="${ECTRANS_TOOLS_RTABLE_PATH}" ) - endforeach() - - endif() - set( HAVE_dp ${HAVE_DOUBLE_PRECISION} ) set( HAVE_sp ${HAVE_SINGLE_PRECISION} ) if( HAVE_GPU ) foreach( prec sp dp ) if( HAVE_${prec} ) - ecbuild_add_executable(TARGET driver-spectrans-${prec} - SOURCES driver-spectraltransform.F90 - INCLUDES - ${MPI_Fortran_INCLUDE_PATH} - $ - LIBS - fiat parkind_${prec} - eccodes_f90 eccodes_memfs - ${MPI_Fortran_LIBRARIES} - trans_gpu_static_${prec} - gpu - OpenACC::OpenACC_Fortran - ${LAPACK_LIBRARIES} - nvhpcwrapnvtx - ) - ecbuild_add_executable(TARGET driver-spectrans-CA-${prec} + ecbuild_add_executable(TARGET driver-spectrans-CA-${prec} SOURCES driver-spectraltransform.F90 INCLUDES ${MPI_Fortran_INCLUDE_PATH} @@ -69,16 +50,8 @@ if( HAVE_GPU ) ${LAPACK_LIBRARIES} nvhpcwrapnvtx ) - #trans_gpu_static_${prec} - #gpu - #${CMAKE_BINARY_DIR}/lib/libtrans_gpu_static_${prec}.a - #${CMAKE_BINARY_DIR}/lib/libgpu.a - #target_link_libraries( driver-spectrans PRIVATE OpenACC::OpenACC_Fortran ) - set_property( TARGET driver-spectrans-${prec} PROPERTY CUDA_ARCHITECTURES 70 ) set_property( TARGET driver-spectrans-CA-${prec} PROPERTY CUDA_ARCHITECTURES 70 ) - target_compile_options( driver-spectrans-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) target_compile_options( driver-spectrans-CA-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) - set_target_properties(driver-spectrans-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") set_target_properties(driver-spectrans-CA-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") message("Building ${prec} GPU driver") endif() diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 749bec737..05e4beca8 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -26,43 +26,12 @@ endif() set( HAVE_dp ${HAVE_DOUBLE_PRECISION} ) set( HAVE_sp ${HAVE_SINGLE_PRECISION} ) -set( FFTW_dp FFTW::fftw3 ) -set( FFTW_sp FFTW::fftw3f ) set( IFS_ACC_Fortran_LIBRARIES OpenACC::OpenACC_Fortran PARENT_SCOPE) foreach( prec sp dp ) if( HAVE_${prec} ) - ecbuild_add_library( - TARGET trans_gpu_${prec} - LINKER_LANGUAGE Fortran - SOURCES ${trans_src} - PUBLIC_INCLUDES $ - $ - $ - $ - $ - PRIVATE_INCLUDES ${MPI_Fortran_INCLUDE_PATH} - PUBLIC_LIBS parkind_${prec} - fiat - PRIVATE_LIBS ${LAPACK_LIBRARIES} - ) - ecbuild_add_library( - TARGET trans_gpu_static_${prec} - LINKER_LANGUAGE Fortran - SOURCES ${trans_src} - TYPE STATIC - PUBLIC_INCLUDES $ - $ - $ - $ - $ - PRIVATE_INCLUDES ${MPI_Fortran_INCLUDE_PATH} - PUBLIC_LIBS parkind_${prec} - fiat - PRIVATE_LIBS ${LAPACK_LIBRARIES} - ) ecbuild_add_library( TARGET trans_gpu_static_CA_${prec} LINKER_LANGUAGE Fortran @@ -79,57 +48,23 @@ foreach( prec sp dp ) PRIVATE_LIBS ${LAPACK_LIBRARIES} ) - ectrans_target_fortran_module_directory( - TARGET trans_gpu_${prec} - MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/trans_gpu_${prec} - INSTALL_DIRECTORY module/trans_gpu_${prec} - ) - ectrans_target_fortran_module_directory( - TARGET trans_gpu_static_${prec} - MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/trans_gpu_static_${prec} - INSTALL_DIRECTORY module/trans_gpu_static_${prec} - ) ectrans_target_fortran_module_directory( TARGET trans_gpu_static_CA_${prec} MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/trans_gpu_static_CA_${prec} INSTALL_DIRECTORY module/trans_gpu_static_CA_${prec} ) - if( HAVE_FFTW ) - target_link_libraries( trans_gpu_${prec} PRIVATE ${FFTW_LIBRARIES} ) - target_link_libraries( trans_gpu_static_${prec} PRIVATE ${FFTW_LIBRARIES} ) - target_link_libraries( trans_gpu_static_CA_${prec} PRIVATE ${FFTW_LIBRARIES} ) - target_include_directories( trans_gpu_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_include_directories( trans_gpu_static_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_include_directories( trans_gpu_static_CA_${prec} PRIVATE ${FFTW_INCLUDE_DIRS} ) - target_compile_definitions( trans_gpu_${prec} PRIVATE WITH_FFTW ) - endif() - if( HAVE_OMP ) - target_link_libraries( trans_gpu_${prec} PRIVATE OpenMP::OpenMP_Fortran ) - target_link_libraries( trans_gpu_static_${prec} PRIVATE OpenMP::OpenMP_Fortran ) target_link_libraries( trans_gpu_static_CA_${prec} PRIVATE OpenMP::OpenMP_Fortran ) endif() - target_link_libraries( trans_gpu_${prec} PRIVATE OpenACC::OpenACC_Fortran ) - set_property( TARGET trans_gpu_${prec} PROPERTY CUDA_ARCHITECTURES 70 ) - target_compile_options( trans_gpu_${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc> ) - target_compile_options( trans_gpu_static_${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc> ) target_compile_options( trans_gpu_static_CA_${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc> ) - - target_link_libraries( trans_gpu_static_${prec} PRIVATE OpenACC::OpenACC_Fortran ) - set_property( TARGET trans_gpu_static_${prec} PROPERTY CUDA_ARCHITECTURES 70 ) - target_link_libraries( trans_gpu_static_CA_${prec} PRIVATE OpenACC::OpenACC_Fortran ) set_property( TARGET trans_gpu_static_CA_${prec} PROPERTY CUDA_ARCHITECTURES 70 ) if( prec STREQUAL sp ) - target_compile_definitions( trans_gpu_${prec} PUBLIC TRANS_SINGLE PARKINDTRANS_SINGLE ) - target_compile_definitions( trans_gpu_static_${prec} PUBLIC TRANS_SINGLE PARKINDTRANS_SINGLE ) - target_compile_definitions( trans_gpu_static_CA_${prec} PUBLIC TRANS_SINGLE PARKINDTRANS_SINGLE ) + target_compile_definitions( trans_gpu_static_CA_${prec} PUBLIC TRANS_SINGLE PARKINDTRANS_SINGLE ) endif() - target_compile_definitions( trans_gpu_static_CA_${prec} PUBLIC USE_CUDA_AWARE_MPI_FT ) - endif() endforeach() diff --git a/src/trans/gpu/external/dir_transad.F90 b/src/trans/gpu/external/dir_transad.F90 deleted file mode 100755 index 4b0ac951d..000000000 --- a/src/trans/gpu/external/dir_transad.F90 +++ /dev/null @@ -1,505 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -SUBROUTINE DIR_TRANSAD(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& -& KPROMA,KVSETUV,KVSETSC,KRESOL,KVSETSC3A,KVSETSC3B,KVSETSC2,& -& PGP,PGPUV,PGP3A,PGP3B,PGP2) - - -!**** *DIR_TRANSAD* - Direct spectral transform - adjoint. - -! Purpose. -! -------- -! Interface routine for the direct spectral transform - adjoint - -!** Interface. -! ---------- -! CALL DIR_TRANSAD(...) - -! Explicit arguments : All arguments except from PGP are optional. -! -------------------- -! PSPVOR(:,:) - spectral vorticity (output) -! PSPDIV(:,:) - spectral divergence (output) -! PSPSCALAR(:,:) - spectral scalarvalued fields (output) -! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) -! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) -! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) -! KPROMA - required blocking factor for gridpoint output -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) -! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) -! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) -! KRESOL - resolution tag which is required ,default is the -! first defined resulution (input) -! PGP(:,:,:) - gridpoint fields (input) -! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where -! NPROMA is the blocking factor, IF_GP the total number -! of output fields and NGPBLKS the number of NPROMA blocks. -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): -! -! u : IF_UV_G fields (if psvor present) -! v : IF_UV_G fields (if psvor present) -! scalar fields : IF_SCALARS_G fields (if pspscalar present) -! -! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length -! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction -! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the -! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral -! 'b-set' split -! -! As an alternative to using PGP you can also use a combination of the -! following arrays. The reason for introducing these alternative ways -! of calling DIR_TRANS is to avoid uneccessary copies where your data -! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. -! The use of any of these precludes the use of PGP and vice versa. - -! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order -! described for PGP. The second dimension of PGPUV should -! be the same as the "global" first dimension of -! PSPVOR,PSPDIV (in the IFS this is the number of levels) -! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (u,v) -! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A -! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC3A ) -! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B -! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC3B) -! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 -! dimensioned(NPROMA,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC2 ) -! -! Method. -! ------- - -! Externals. SET_RESOL - set resolution -! ---------- DIR_TRANS_CTLAD - control routine -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -!ifndef INTERFACE - -USE TPM_GEN ,ONLY : NERR, NOUT -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, & - & NGPBLKS, NF_SC2, NF_SC3A, NF_SC3B, NPROMA -USE TPM_DISTR ,ONLY : D, MYSETV, NPRTRV - -USE SET_RESOL_MOD ,ONLY : SET_RESOL -USE DIR_TRANS_CTLAD_MOD ,ONLY : DIR_TRANS_CTLAD -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -!endif INTERFACE - -IMPLICIT NONE - -! Declaration of arguments - -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL - -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(OUT) :: PGP2(:,:,:) - -!ifndef INTERFACE - -! Local variables -INTEGER(KIND=JPIM) :: IUBOUND(4),J -INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: IF_SC2_G,IF_SC3A_G,IF_SC3B_G -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - -! ------------------------------------------------------------------ -IF (LHOOK) CALL DR_HOOK('DIR_TRANSAD',0,ZHOOK_HANDLE) - -CALL GSTATS(1810,0) -! Set current resolution - -CALL SET_RESOL(KRESOL) - -! Set defaults - -IF_UV = 0 -IF_UV_G = 0 -IF_SCALARS = 0 -IF_SCALARS_G = 0 -NF_SC2 = 0 -NF_SC3A = 0 -NF_SC3B = 0 -IF_SC2_G = 0 -IF_SC3A_G = 0 -IF_SC3B_G = 0 -NPROMA = D%NGPTOT -LSCDERS=.FALSE. ! This is for use in TRLTOG which is shared with inverse transform -LVORGP=.FALSE. -LDIVGP=.FALSE. -LUVDER=.FALSE. - -! Decide requirements - - -IF(PRESENT(KVSETUV)) THEN - IF_UV_G = UBOUND(KVSETUV,1) - DO J=1,IF_UV_G - IF(KVSETUV(J) > NPRTRV) THEN - WRITE(NERR,*) 'DIR_TRANSAD:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD:KVSETUV CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETUV(J) == MYSETV) THEN - IF_UV = IF_UV+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPVOR)) THEN - IF_UV = UBOUND(PSPVOR,1) - IF_UV_G = IF_UV -ENDIF - -IF(PRESENT(KVSETSC)) THEN - IF_SCALARS_G = UBOUND(KVSETSC,1) - DO J=1,IF_SCALARS_G - IF(KVSETSC(J) > NPRTRV) THEN - WRITE(NERR,*) 'DIR_TRANSAD:KVSETSC(J) > NPRTRV ',J,KVSETSC(J),NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSCALAR)) THEN - IF_SCALARS = UBOUND(PSPSCALAR,1) - IF_SCALARS_G = IF_SCALARS -ENDIF - -IF(PRESENT(KVSETSC2)) THEN - IF(.NOT.PRESENT(PSPSC2)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC2 BUT NOT PSPSC2') - ENDIF - IF_SC2_G = UBOUND(KVSETSC2,1) - IF_SCALARS_G = IF_SCALARS_G+IF_SC2_G - DO J=1,UBOUND(KVSETSC2,1) - IF(KVSETSC2(J) > NPRTRV .OR. KVSETSC2(J) < 1) THEN - WRITE(NERR,*) 'DIR_TRANS:KVSETSC2(J) > NPRTRV ',J,KVSETSC2(J),NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC2 TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC2(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+1 - NF_SC2 = NF_SC2+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC2)) THEN - IF_SC2_G = UBOUND(PSPSC2,1) - NF_SC2 = UBOUND(PSPSC2,1) - IF_SCALARS = IF_SCALARS+NF_SC2 - IF_SCALARS_G = IF_SCALARS_G +IF_SC2_G -ENDIF - -IF(PRESENT(KVSETSC3A)) THEN - IF(.NOT.PRESENT(PSPSC3A)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC3A BUT NOT PSPSC3A') - ENDIF - IF_SC3A_G = UBOUND(KVSETSC3A,1) - IF_SCALARS_G = IF_SCALARS_G+IF_SC3A_G*UBOUND(PSPSC3A,3) - DO J=1,UBOUND(KVSETSC3A,1) - IF(KVSETSC3A(J) > NPRTRV .OR. KVSETSC3A(J) < 1) THEN - WRITE(NERR,*) 'DIR_TRANSAD:KVSETSC3A(J) > NPRTRV ',J,KVSETSC3A(J),NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC3A TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC3A(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,3) - NF_SC3A = NF_SC3A+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC3A)) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,1)*UBOUND(PSPSC3A,3) - IF_SC3A_G = UBOUND(PSPSC3A,1) - IF_SCALARS_G = IF_SCALARS_G +IF_SC3A_G*UBOUND(PSPSC3A,3) - NF_SC3A = UBOUND(PSPSC3A,1) -ENDIF - -IF(PRESENT(KVSETSC3B)) THEN - IF(.NOT.PRESENT(PSPSC3B)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC3B BUT NOT PSPSC3B') - ENDIF - IF_SC3B_G = UBOUND(KVSETSC3B,1) - IF_SCALARS_G = IF_SCALARS_G+IF_SC3B_G*UBOUND(PSPSC3B,3) - DO J=1,UBOUND(KVSETSC3B,1) - IF(KVSETSC3B(J) > NPRTRV .OR. KVSETSC3B(J) < 1) THEN - WRITE(NERR,*) 'DIR_TRANSAD:KVSETSC3B(J) > NPRTRV ',J,KVSETSC3B(J),NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD:KVSETSC3B TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC3B(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,3) - NF_SC3B = NF_SC3B+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC3B)) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,1)*UBOUND(PSPSC3B,3) - IF_SC3B_G = UBOUND(PSPSC3B,1) - IF_SCALARS_G = IF_SCALARS_G +IF_SC3B_G*UBOUND(PSPSC3B,3) - NF_SC3B = UBOUND(PSPSC3B,1) -ENDIF - -IF(PRESENT(KPROMA)) THEN - NPROMA = KPROMA -ENDIF - -! Compute derived variables - - -NGPBLKS = (D%NGPTOT-1)/NPROMA+1 - -IF_FS = 2*IF_UV + IF_SCALARS - -IF_GP = 2*IF_UV_G+IF_SCALARS_G - -! Consistency checks - -IF (IF_UV > 0) THEN - IF(.NOT. PRESENT(PSPVOR) ) THEN - CALL ABORT_TRANS('DIR_TRANSAD : IF_UV > 0 BUT PSPVOR MISSING') - ENDIF - IF(UBOUND(PSPVOR,1) < IF_UV) THEN - WRITE(NERR,*)'DIR_TRANSAD : UBOUND(PSPVOR,1) < IF_UV ',& - & UBOUND(PSPVOR,1),IF_UV - CALL ABORT_TRANS('DIR_TRANSAD : PSPVOR TOO SHORT') - ENDIF - IF(.NOT. PRESENT(PSPDIV) ) THEN - CALL ABORT_TRANS('DIR_TRANSAD : PSPVOR PRESENT BUT PSPDIV MISSING') - ENDIF - IF(UBOUND(PSPDIV,1) /= IF_UV) THEN - WRITE(NERR,*)'DIR_TRANSAD : UBOUND(PSPDIV,1) < IF_UV ',& - & UBOUND(PSPDIV,1),IF_UV - CALL ABORT_TRANS('DIR_TRANSAD : INCONSISTENT FIRST DIM. OF PSPVOR AND PSPDIV') - ENDIF -ENDIF - -IF (IF_SCALARS > 0) THEN - IF(PRESENT(PSPSCALAR)) THEN - IF(UBOUND(PSPSCALAR,1) < IF_SCALARS) THEN - WRITE(NERR,*)'DIR_TRANS : UBOUND(PSPSCALAR,1) < IF_SCALARS) ',& - & UBOUND(PSPSCALAR,1),IF_SCALARS - CALL ABORT_TRANS('DIR_TRANSAD : PSPSCALAR TOO SHORT') - ENDIF - IF(PRESENT(PSPSC3A))THEN - CALL ABORT_TRANS('DIR_TRANSAD : PSPSCALAR AND PSPSC3A BOTH PRESENT') - ENDIF - IF(PRESENT(PSPSC3B))THEN - CALL ABORT_TRANS('DIR_TRANSAD : PSPSCALAR AND PSPSC3B BOTH PRESENT') - ENDIF - IF(PRESENT(PSPSC2))THEN - CALL ABORT_TRANS('DIR_TRANSAD : PSPSCALAR AND PSPSC2 BOTH PRESENT') - ENDIF - ENDIF -ENDIF - -IF(NPRTRV >1) THEN - IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN - WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& - &NPRTRV,IF_UV - CALL ABORT_TRANS('DIR_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSCALAR) .AND. .NOT. PRESENT(KVSETSC)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSCALAR) AND NOT PRESENT(KVSETSC)',& - &NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC2) .AND. .NOT. PRESENT(KVSETSC2)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC2) AND NOT PRESENT(KVSETSC2)',& - &NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC3A) .AND. .NOT. PRESENT(KVSETSC3A)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3A) AND NOT PRESENT(KVSETSC3A)',& - &NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC3B) .AND. .NOT. PRESENT(KVSETSC3B)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3B) AND NOT PRESENT(KVSETSC3B)',& - &NPRTRV - CALL ABORT_TRANS('DIR_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF -ENDIF - -IF(PRESENT(PGP)) THEN - IUBOUND(1:3)=UBOUND(PGP) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('DIR_TRANSAD:FIRST DIMENSION OF PGP TOO SMALL ') - ENDIF - IF(IUBOUND(2) < IF_GP) THEN - WRITE(NOUT,*)'DIR_TRANSAD:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),IF_GP - CALL ABORT_TRANS('DIR_TRANSAD:SECOND DIMENSION OF PGP TOO SMALL ') - ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'DIR_TRANSAD:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('DIR_TRANSAD:THIRD DIMENSION OF PGP TOO SMALL ') - ENDIF -ENDIF - -IF(PRESENT(PGPUV)) THEN - IF(.NOT.PRESENT(PSPVOR)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:PSPVOR HAS TO BE PRESENT WHEN PGPUV IS') - ENDIF - IUBOUND=UBOUND(PGPUV) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FIRST DIM. OF PGPUV TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('DIR_TRANSAD:FIRST DIMENSION OF PGPUV TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_UV_G) THEN - WRITE(NOUT,*)'DIR_TRANSAD:SEC. DIM. OF PGPUV INCONSISTENT ',IUBOUND(2),IF_UV_G - CALL ABORT_TRANS('DIR_TRANSAD:SEC. DIMENSION OF PGPUV INCONSISTENT ') - ENDIF - IF(IUBOUND(3) < 2) THEN - WRITE(NOUT,*)'DIR_TRANSAD:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(3),2 - CALL ABORT_TRANS('DIR_TRANSAD:THIRD DIMENSION OF PGPUV TOO SMALL ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FOURTH DIM. OF PGPUV TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('DIR_TRANSAD:FOURTH DIMENSION OF PGPUV TOO SMALL ') - ENDIF -ENDIF - -IF(PRESENT(PGP2)) THEN - IF(.NOT.PRESENT(PSPSC2)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:PSPSC2 HAS TO BE PRESENT WHEN PGP2 IS') - ENDIF -ENDIF -IF(IF_SC2_G > 0) THEN - IF(PRESENT(PGP2)) THEN - IUBOUND(1:3)=UBOUND(PGP2) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FIRST DIM. OF PGP2 TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('DIR_TRANSAD:FIRST DIMENSION OF PGP2 TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC2_G) THEN - WRITE(NOUT,*)'DIR_TRANSAD:SEC. DIM. OF PGP2 INCONSISTENT ',IUBOUND(2),IF_SC2_G - CALL ABORT_TRANS('DIR_TRANSAD:SEC. DIMENSION OF PGP2 INCONSISTENT') - ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'DIR_TRANSAD:THIRD DIM. OF PGP2 TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('DIR_TRANSAD:THIRD DIMENSION OF PGP2 TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('DIR_TRANSAD:PGP2 MISSING') - ENDIF -ENDIF - -IF(PRESENT(PGP3A)) THEN - IF(.NOT.PRESENT(PSPSC3A)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:PSPSC3A HAS TO BE PRESENT WHEN PGP3A IS') - ENDIF -ENDIF -IF(IF_SC3A_G > 0) THEN - IF(PRESENT(PGP3A)) THEN - IUBOUND=UBOUND(PGP3A) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FIRST DIM. OF PGP3A TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('DIR_TRANSAD:FIRST DIMENSION OF PGP3A TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC3A_G) THEN - WRITE(NOUT,*)'DIR_TRANSAD:SEC DIM. OF PGP3A INCONSISTENT ',IUBOUND(2),IF_SC3A_G - CALL ABORT_TRANS('DIR_TRANSAD:SEC. DIMENSION OF PGP3A INCONSISTENT ') - ENDIF - IF(IUBOUND(3) /= UBOUND(PSPSC3A,3) ) THEN - WRITE(NOUT,*)'DIR_TRANSAD:THIRD DIM. OF PGP3A INCONSISTENT ',& - & IUBOUND(3),UBOUND(PSPSC3A,3) - CALL ABORT_TRANS('DIR_TRANSAD:THIRD DIMENSION OF PGP3A INCONSISTENT ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FOURTH DIM. OF PGP3A TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('DIR_TRANSAD:FOURTH DIMENSION OF PGP3A TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('DIR_TRANSAD:PGP3A MISSING') - ENDIF -ENDIF - -IF(PRESENT(PGP3B)) THEN - IF(.NOT.PRESENT(PSPSC3B)) THEN - CALL ABORT_TRANS('DIR_TRANSAD:PSPSC3B HAS TO BE PRESENT WHEN PGP3B IS') - ENDIF -ENDIF -IF(IF_SC3B_G > 0) THEN - IF(PRESENT(PGP3B)) THEN - IUBOUND=UBOUND(PGP3B) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FIRST DIM. OF PGP3B TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('DIR_TRANSAD:FIRST DIMENSION OF PGP3B TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC3B_G) THEN - WRITE(NOUT,*)'DIR_TRANSAD:SEC DIM. OF PGP3B INCONSISTENT ',IUBOUND(2),IF_SC3B_G - CALL ABORT_TRANS('DIR_TRANSAD:SEC. DIMENSION OF PGP3B INCONSISTENT ') - ENDIF - IF(IUBOUND(3) /= UBOUND(PSPSC3B,3) ) THEN - WRITE(NOUT,*)'DIR_TRANSAD:THIRD DIM. OF PGP3B INCONSISTENT ',& - & IUBOUND(3),UBOUND(PSPSC3B,3) - CALL ABORT_TRANS('DIR_TRANSAD:THIRD DIMENSION OF PGP3B INCONSISTENT ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'DIR_TRANSAD:FOURTH DIM. OF PGP3B TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('DIR_TRANSAD:FOURTH DIMENSION OF PGP3B TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('DIR_TRANSAD:PGP3B MISSING') - ENDIF -ENDIF -CALL GSTATS(1810,1) - -! Perform transform - -CALL DIR_TRANS_CTLAD(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,IF_UV,IF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -IF (LHOOK) CALL DR_HOOK('DIR_TRANSAD',1,ZHOOK_HANDLE) - -! ------------------------------------------------------------------ -!endif INTERFACE - -END SUBROUTINE DIR_TRANSAD - - diff --git a/src/trans/gpu/external/inv_transad.F90 b/src/trans/gpu/external/inv_transad.F90 deleted file mode 100755 index f4ad3d0fd..000000000 --- a/src/trans/gpu/external/inv_transad.F90 +++ /dev/null @@ -1,618 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -SUBROUTINE INV_TRANSAD(PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2,& - & FSPGL_PROC,& - & LDSCDERS,LDVORGP,LDDIVGP,LDUVDER,KPROMA,KVSETUV,KVSETSC,KRESOL,& - & KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) - -!**** *INV_TRANSAD* - Inverse spectral transform - adjoint. - -! Purpose. -! -------- -! Interface routine for the inverse spectral transform - adjoint - -!** Interface. -! ---------- -! CALL INV_TRANSAD(...) - -! Explicit arguments : All arguments except from PGP are optional. -! -------------------- -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PSPSCALAR(:,:) - spectral scalarvalued fields (input) -! PSPSC3A(:,:,:) - alternative to use of PSPSCALAR, see PGP3A below (input) -! PSPSC3B(:,:,:) - alternative to use of PSPSCALAR, see PGP3B below (input) -! PSPSC2(:,:) - alternative to use of PSPSCALAR, see PGP2 below (input) -! FSPGL_PROC - external procedure to be executed in fourier space -! before transposition -! LDSCDERS - indicating if derivatives of scalar variables are req. -! LDVORGP - indicating if grid-point vorticity is req. -! LDDIVGP - indicating if grid-point divergence is req. -! LDUVDER - indicating if E-W derivatives of u and v are req. -! KPROMA - required blocking factor for gridpoint output -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! KVSETSC3A(:) - as KVESETSC for PSPSC3A (distribution on first dimension) -! KVSETSC3B(:) - as KVESETSC for PSPSC3B (distribution on first dimension) -! KVSETSC2(:) - as KVESETSC for PSPSC2 (distribution on first dimension) -! KRESOL - resolution tag which is required ,default is the -! first defined resulution (input) -! PGP(:,:,:) - gridpoint fields (output) -! PGP need to dimensioned (NPROMA,IF_GP,NGPBLKS) where -! NPROMA is the blocking factor, IF_GP the total number -! of output fields and NGPBLKS the number of NPROMA blocks. -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): -! -! vorticity : IF_UV_G fields (if psvor present and LDVORGP) -! divergence : IF_UV_G fields (if psvor present and LDDIVGP) -! u : IF_UV_G fields (if psvor present) -! v : IF_UV_G fields (if psvor present) -! scalar fields : IF_SCALARS_G fields (if pspscalar present) -! N-S derivative of scalar fields : IF_SCALARS_G fields (if pspscalar -! present and LDSCDERS) -! E-W derivative of u : IF_UV_G fields (if psvor present and and LDUVDER) -! E-W derivative of v : IF_UV_G fields (if psvor present and and LDUVDER) -! E-W derivative of scalar fields : IF_SCALARS_G fields (if pspscalar -! present and LDSCDERS) -! -! Here IF_UV_G is the GLOBAL number of u/v fields as given by the length -! of KVSETUV (or by PSPVOR if no split in spectral 'b-set' direction -! IF_SCALARS_G is the GLOBAL number of scalar fields as giben by the -! length of KVESETSC (or by number of fields in PSPSCALAR if no spectral -! 'b-set' split - -! As an alternative to using PGP you can also use a combination of the -! following arrays. The reason for introducing these alternative ways -! of calling INV_TRANS is to avoid uneccessary copies where your data -! structures don't fit in to the 'PSPVOR,PSPDIV, PSPSCALAR, PGP' layout. -! The use of any of these precludes the use of PGP and vice versa. -! -! PGPUV(:,:,:,:) - the 'u-v' related grid-point variables in the order -! described for PGP. The second dimension of PGPUV should -! be the same as the "global" first dimension of -! PSPVOR,PSPDIV (in the IFS this is the number of levels) -! PGPUV need to be dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (u,v,vor,div ...) -! PGP3A(:,:,:,:) - grid-point array directly connected with PSPSC3A -! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC3A if no derivatives, 3 times that with der.) -! PGP3B(:,:,:,:) - grid-point array directly connected with PSPSC3B -! dimensioned(NPROMA,ILEVS,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC3B if no derivatives, 3 times that with der.) -! PGP2(:,:,:) - grid-point array directly connected with PSPSC2 -! dimensioned(NPROMA,IFLDS,NGPBLKS) -! IFLDS is the number of 'variables' (the same as in -! PSPSC2 if no derivatives, 3 times that with der.) - -! Method. -! ------- - -! Externals. SET_RESOL - set resolution -! ---------- LTDIR_CTLAD - control of Legendre transform -! FTDIR_CTLAD - control of Fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -!ifndef INTERFACE - -USE TPM_GEN ,ONLY : NERR, NOUT, NPROMATR -!USE TPM_DIM -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, & - & NF_SC2, NF_SC3A, NF_SC3B, & - & NGPBLKS, NPROMA -USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV -!USE TPM_GEOMETRY -!USE TPM_FIELDS -!USE TPM_FFT - -USE SET_RESOL_MOD ,ONLY : SET_RESOL -USE INV_TRANS_CTLAD_MOD ,ONLY : INV_TRANS_CTLAD -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -!endif INTERFACE - -IMPLICIT NONE - -! Declaration of arguments - -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) -LOGICAL ,OPTIONAL, INTENT(IN) :: LDSCDERS -LOGICAL ,OPTIONAL, INTENT(IN) :: LDVORGP -LOGICAL ,OPTIONAL, INTENT(IN) :: LDDIVGP -LOGICAL ,OPTIONAL, INTENT(IN) :: LDUVDER -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPROMA -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KRESOL -EXTERNAL FSPGL_PROC -OPTIONAL FSPGL_PROC -REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL ,INTENT(IN) :: PGP2(:,:,:) - -!ifndef INTERFACE - -! Local varaibles -INTEGER(KIND=JPIM) :: IUBOUND(4),J -INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP,IF_OUT_LT -INTEGER(KIND=JPIM) :: IF_SCDERS,IF_UV_PAR -INTEGER(KIND=JPIM) :: IF_SC2_G,IF_SC3A_G2,IF_SC3A_G3,IF_SC3B_G2,IF_SC3B_G3 -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE -! ------------------------------------------------------------------ - -IF (LHOOK) CALL DR_HOOK('INV_TRANSAD',0,ZHOOK_HANDLE) -CALL GSTATS(1809,0) -! Set current resolution -CALL SET_RESOL(KRESOL) - -! Set defaults - -LVORGP = .FALSE. -LDIVGP = .FALSE. -LUVDER = .FALSE. -IF_UV = 0 -IF_UV_G = 0 -IF_UV_PAR = 0 -IF_SCALARS = 0 -IF_SCALARS_G = 0 -IF_SCDERS = 0 -NF_SC2 = 0 -NF_SC3A = 0 -NF_SC3B = 0 -IF_SC2_G = 0 -IF_SC3A_G2 = 0 -IF_SC3B_G2 = 0 -IF_SC3A_G3 = 0 -IF_SC3B_G3 = 0 -NPROMA = D%NGPTOT -LSCDERS = .FALSE. - -! Decide requirements - -IF(PRESENT(KVSETUV)) THEN - IF_UV_G = UBOUND(KVSETUV,1) - IF_UV_PAR = 2 - DO J=1,IF_UV_G - IF(KVSETUV(J) > NPRTRV .OR. KVSETUV(J) < 1) THEN - WRITE(NERR,*) 'INV_TRANSAD:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV - CALL ABORT_TRANS('INV_TRANSAD:KVSETUV TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETUV(J) == MYSETV) THEN - IF_UV = IF_UV+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPVOR)) THEN - IF_UV = UBOUND(PSPVOR,1) - IF_UV_G = IF_UV - IF_UV_PAR = 2 -ENDIF - -IF(PRESENT(KVSETSC)) THEN - IF_SCALARS_G = UBOUND(KVSETSC,1) - DO J=1,IF_SCALARS_G - IF(KVSETSC(J) > NPRTRV .OR. KVSETSC(J) < 1) THEN - WRITE(NERR,*) 'INV_TRANSAD:KVSETSC(J) > NPRTRV ',J,KVSETSC(J),NPRTRV - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSCALAR)) THEN - IF_SCALARS = UBOUND(PSPSCALAR,1) - IF_SCALARS_G = IF_SCALARS -ENDIF - -IF(PRESENT(KVSETSC2)) THEN - IF(.NOT.PRESENT(PSPSC2)) THEN - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC2 BUT NOT PSPSC2') - ENDIF - IF_SC2_G = UBOUND(KVSETSC2,1) - IF_SCALARS_G = IF_SCALARS_G+UBOUND(KVSETSC2,1) - DO J=1,UBOUND(KVSETSC2,1) - IF(KVSETSC2(J) > NPRTRV .OR. KVSETSC2(J) < 1) THEN - WRITE(NERR,*) 'INV_TRANSAD:KVSETSC2(J) > NPRTRV ',J,KVSETSC2(J),NPRTRV - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC2 TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC2(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+1 - NF_SC2 = NF_SC2+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC2)) THEN - IF_SC2_G = UBOUND(PSPSC2,1) - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC2,1) - IF_SCALARS_G = IF_SCALARS_G +UBOUND(PSPSC2,1) - NF_SC2 = UBOUND(PSPSC2,1) -ENDIF - -IF(PRESENT(KVSETSC3A)) THEN - IF(.NOT.PRESENT(PSPSC3A)) THEN - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC3A BUT NOT PSPSC3A') - ENDIF - IF_SC3A_G2 = UBOUND(KVSETSC3A,1) - IF_SC3A_G3 = UBOUND(PSPSC3A,3) - IF_SCALARS_G = IF_SCALARS_G+IF_SC3A_G2*IF_SC3A_G3 - DO J=1,UBOUND(KVSETSC3A,1) - IF(KVSETSC3A(J) > NPRTRV .OR. KVSETSC3A(J) < 1) THEN - WRITE(NERR,*) 'INV_TRANSAD:KVSETSC3A(J) > NPRTRV ',J,KVSETSC3A(J),NPRTRV - CALL ABORT_TRANS& - &('INV_TRANSAD:KVSETSC3A TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC3A(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,3) - NF_SC3A = NF_SC3A+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC3A)) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3A,1)*UBOUND(PSPSC3A,3) - IF_SC3A_G2 = UBOUND(PSPSC3A,1) - IF_SC3A_G3 = UBOUND(PSPSC3A,3) - IF_SCALARS_G = IF_SCALARS_G + IF_SC3A_G2*IF_SC3A_G3 - NF_SC3A = UBOUND(PSPSC3A,1) -ENDIF - -IF(PRESENT(KVSETSC3B)) THEN - IF(.NOT.PRESENT(PSPSC3B)) THEN - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC3B BUT NOT PSPSC3B') - ENDIF - IF_SC3B_G2 = UBOUND(KVSETSC3B,1) - IF_SC3B_G3 = UBOUND(PSPSC3B,3) - IF_SCALARS_G = IF_SCALARS_G+IF_SC3B_G2*IF_SC3B_G3 - DO J=1,UBOUND(KVSETSC3B,1) - IF(KVSETSC3B(J) > NPRTRV .OR. KVSETSC3B(J) < 1) THEN - WRITE(NERR,*) 'INV_TRANSAD:KVSETSC3B(J) > NPRTRV ',J,KVSETSC3B(J),NPRTRV - CALL ABORT_TRANS('INV_TRANSAD:KVSETSC3B TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETSC3B(J) == MYSETV) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,3) - NF_SC3B = NF_SC3B+1 - ENDIF - ENDDO -ELSEIF(PRESENT(PSPSC3B)) THEN - IF_SCALARS = IF_SCALARS+UBOUND(PSPSC3B,1)*UBOUND(PSPSC3B,3) - IF_SC3B_G2 = UBOUND(PSPSC3B,1) - IF_SC3B_G3 = UBOUND(PSPSC3B,3) - IF_SCALARS_G = IF_SCALARS_G + IF_SC3B_G2*IF_SC3B_G3 - NF_SC3B = UBOUND(PSPSC3B,1) -ENDIF - - -IF (IF_SCALARS > 0) THEN - IF(PRESENT(LDSCDERS)) THEN - LSCDERS = LDSCDERS - IF (LSCDERS) IF_SCDERS = IF_SCALARS - ENDIF -ENDIF - -IF(PRESENT(KPROMA)) THEN - NPROMA = KPROMA -ENDIF - -IF(PRESENT(LDVORGP)) THEN - LVORGP = LDVORGP -ENDIF - -IF(PRESENT(LDDIVGP)) THEN - LDIVGP = LDDIVGP -ENDIF - -IF(PRESENT(LDUVDER)) THEN - LUVDER = LDUVDER -ENDIF - - - -! Compute derived variables - - -IF(LVORGP) LDIVGP = .TRUE. - -NGPBLKS = (D%NGPTOT-1)/NPROMA+1 - -IF_OUT_LT = 2*IF_UV + IF_SCALARS+IF_SCDERS - -IF(IF_UV > 0 .AND. LVORGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV -ENDIF -IF(IF_UV > 0 .AND. LDIVGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV -ENDIF -IF_FS = IF_OUT_LT+IF_SCDERS -IF(IF_UV > 0 .AND. LUVDER) THEN - IF_FS = IF_FS+2*IF_UV -ENDIF - -IF_GP = 2*IF_UV_G+IF_SCALARS_G -IF(LSCDERS) THEN - IF_GP = IF_GP+2*IF_SCALARS_G - IF_SC2_G = IF_SC2_G*3 - IF_SC3A_G3 = IF_SC3A_G3*3 - IF_SC3B_G3 = IF_SC3B_G3*3 -ENDIF -IF(IF_UV_G > 0 .AND. LVORGP) THEN - IF_GP = IF_GP+IF_UV_G - IF_UV_PAR = IF_UV_PAR+1 -ENDIF -IF(IF_UV_G > 0 .AND. LDIVGP) THEN - IF_GP = IF_GP+IF_UV_G - IF_UV_PAR = IF_UV_PAR+1 -ENDIF -IF(IF_UV_G > 0 .AND. LUVDER) THEN - IF_GP = IF_GP+2*IF_UV_G - IF_UV_PAR = IF_UV_PAR+2 -ENDIF - -! Consistency checks - -IF (IF_UV > 0) THEN - IF(.NOT. PRESENT(PSPVOR) ) THEN - CALL ABORT_TRANS("INV_TRANSAD : IF_UV > 0 BUT PSPVOR MISSING") - ENDIF - IF(UBOUND(PSPVOR,1) < IF_UV) THEN - WRITE(NERR,*)'INV_TRANSAD : UBOUND(PSPVOR,1) < IF_UV ',& - & UBOUND(PSPVOR,1),IF_UV - CALL ABORT_TRANS("INV_TRANSAD : PSPVOR TOO SHORT") - ENDIF - IF(.NOT. PRESENT(PSPDIV) ) THEN - CALL ABORT_TRANS("INV_TRANSAD : IF_UV > 0 BUT PSPDIV MISSING") - ENDIF - IF(UBOUND(PSPDIV,1) < IF_UV) THEN - WRITE(NERR,*)'INV_TRANSAD : UBOUND(PSPDIV,1) < IF_UV ',& - & UBOUND(PSPDIV,1),IF_UV - CALL ABORT_TRANS("INV_TRANSAD : PSPDIV TOO SHORT") - ENDIF -ENDIF - -IF (IF_SCALARS > 0) THEN - IF(PRESENT(PSPSCALAR)) THEN - IF(PRESENT(PSPSC3A))THEN - CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC3A BOTH PRESENT') - ENDIF - IF(PRESENT(PSPSC3B))THEN - CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC3B BOTH PRESENT') - ENDIF - IF(PRESENT(PSPSC2))THEN - CALL ABORT_TRANS('INV_TRANS : PSPSCALAR AND PSPSC2 BOTH PRESENT') - ENDIF - IF(UBOUND(PSPSCALAR,1) < IF_SCALARS) THEN - WRITE(NERR,*)'INV_TRANS : UBOUND(PSPSCALAR,1) < IF_SCALARS) ',& - & UBOUND(PSPSCALAR,1),IF_SCALARS - CALL ABORT_TRANS('INV_TRANS : PSPSCALAR TOO SHORT') - ENDIF - ELSEIF(PRESENT(PSPSC3A)) THEN - ENDIF -ENDIF - -IF(IF_UV_G == 0) THEN - LUVDER = .FALSE. -ENDIF - -IF(NPRTRV >1) THEN - IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN - WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& - &NPRTRV,IF_UV - CALL ABORT_TRANS('INV_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSCALAR) .AND. .NOT. PRESENT(KVSETSC)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSCALAR) AND NOT PRESENT(KVSETSC)',& - &NPRTRV - CALL ABORT_TRANS('INV_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC2) .AND. .NOT. PRESENT(KVSETSC2)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC2) AND NOT PRESENT(KVSETSC2)',& - &NPRTRV - CALL ABORT_TRANS('INV_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC3A) .AND. .NOT. PRESENT(KVSETSC3A)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3A) AND NOT PRESENT(KVSETSC3A)',& - &NPRTRV - CALL ABORT_TRANS('INV_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF - IF(PRESENT(PSPSC3B) .AND. .NOT. PRESENT(KVSETSC3B)) THEN - WRITE(NERR,*)'NPRTRV >1 AND PRESENT(PSPSC3B) AND NOT PRESENT(KVSETSC3B)',& - &NPRTRV - CALL ABORT_TRANS('INV_TRANSAD: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF -ENDIF - -IF(PRESENT(PGP)) THEN - IF(PRESENT(PGPUV)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PGP AND PGPUV CAN NOT BOTH BE PRESENT') - ENDIF - IF(PRESENT(PGP3A)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PGP AND PGP3A CAN NOT BOTH BE PRESENT') - ENDIF - IF(PRESENT(PGP3B)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PGP AND PGP3B CAN NOT BOTH BE PRESENT') - ENDIF - IF(PRESENT(PGP2)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PGP AND PGP2 CAN NOT BOTH BE PRESENT') - ENDIF - IUBOUND(1:3)=UBOUND(PGP) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANSAD:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANSAD:FIRST DIMENSION OF PGP TOO SMALL ') - ENDIF - IF(IUBOUND(2) < IF_GP) THEN - WRITE(NOUT,*)'INV_TRANSAD:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),IF_GP - WRITE(NOUT,*)'IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER ',& - & IF_UV_G,IF_SCALARS_G,LSCDERS,LVORGP,LDIVGP,LUVDER - CALL ABORT_TRANS('INV_TRANSAD:SECOND DIMENSION OF PGP TOO SMALL ') - ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGP TOO SMALL ') - ENDIF -ELSE - IF(NPROMATR > 0 .AND. 2*IF_UV_G+IF_SCALARS_G > NPROMATR) THEN - CALL ABORT_TRANS('INV_TRANSAD:ALTERNATIVES TO USING PGP NOT SUPPORTED WITH NPROMATR>0') - ENDIF -ENDIF - -IF(PRESENT(PGPUV)) THEN - IF(.NOT.PRESENT(PSPVOR)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PSPVOR HAS TO BE PRESENT WHEN PGPUV IS') - ENDIF - IUBOUND(1:4)=UBOUND(PGPUV) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANSAD:FIRST DIM. OF PGPUV TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANSAD:FIRST DIMENSION OF PGPUV TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_UV_G) THEN - WRITE(NOUT,*)'INV_TRANSAD:SEC. DIM. OF PGPUV INCONSISTENT ',IUBOUND(2),IF_UV_G - CALL ABORT_TRANS('INV_TRANSAD:SEC. DIMENSION OF PGPUV INCONSISTENT ') - ENDIF - IF(IUBOUND(3) < IF_UV_PAR) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(3),IF_UV_PAR - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGPUV TOO SMALL ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGPUV TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGPUV TOO SMALL ') - ENDIF -ENDIF - -IF(PRESENT(PGP2)) THEN - IF(.NOT.PRESENT(PSPSC2)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PSPSC2 HAS TO BE PRESENT WHEN PGP2 IS') - ENDIF -ENDIF -IF(IF_SC2_G > 0) THEN - IF(PRESENT(PGP2)) THEN - IUBOUND(1:3)=UBOUND(PGP2) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANSAD:FIRST DIM. OF PGP2 TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANSAD:FIRST DIMENSION OF PGP2 TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC2_G) THEN - WRITE(NOUT,*)'INV_TRANSAD:SEC. DIM. OF PGP2 INCONSISTENT ',IUBOUND(2),IF_SC2_G - CALL ABORT_TRANS('INV_TRANSAD:SEC. DIMENSION OF PGP2 INCONSISTENT') - ENDIF - IF(IUBOUND(3) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGP2 TOO SMALL ',IUBOUND(3),NGPBLKS - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGP2 TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('INV_TRANSAD:PGP2 MISSING') - ENDIF -ENDIF - -IF(PRESENT(PGP3A)) THEN - IF(.NOT.PRESENT(PSPSC3A)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PSPSC3A HAS TO BE PRESENT WHEN PGP3A IS') - ENDIF -ENDIF -IF(IF_SC3A_G3 > 0) THEN - IF(PRESENT(PGP3A)) THEN - IUBOUND=UBOUND(PGP3A) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANSAD:FIRST DIM. OF PGP3A TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANSAD:FIRST DIMENSION OF PGP3A TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC3A_G2) THEN - WRITE(NOUT,*)'INV_TRANSAD:SEC DIM. OF PGP3A INCONSISTENT ',IUBOUND(2),IF_SC3A_G2 - CALL ABORT_TRANS('INV_TRANSAD:SEC. DIMENSION OF PGP3A INCONSISTENT ') - ENDIF - IF(IUBOUND(3) /= IF_SC3A_G3 ) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGP3A INCONSISTENT ',& - & IUBOUND(3),IF_SC3A_G3 - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGP3A INCONSISTENT ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANSAD:FOURTH DIM. OF PGP3A TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANSAD:FOURTH DIMENSION OF PGP3A TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('INV_TRANSAD:PGP3A MISSING') - ENDIF -ENDIF - -IF(PRESENT(PGP3B)) THEN - IF(.NOT.PRESENT(PSPSC3B)) THEN - CALL ABORT_TRANS('INV_TRANSAD:PSPSC3B HAS TO BE PRESENT WHEN PGP3B IS') - ENDIF -ENDIF -IF(IF_SC3B_G3 > 0) THEN - IF(PRESENT(PGP3B)) THEN - IUBOUND=UBOUND(PGP3B) - IF(IUBOUND(1) < NPROMA) THEN - WRITE(NOUT,*)'INV_TRANSAD:FIRST DIM. OF PGP3B TOO SMALL ',IUBOUND(1),NPROMA - CALL ABORT_TRANS('INV_TRANSAD:FIRST DIMENSION OF PGP3B TOO SMALL ') - ENDIF - IF(IUBOUND(2) /= IF_SC3B_G2) THEN - WRITE(NOUT,*)'INV_TRANSAD:SEC DIM. OF PGP3B INCONSISTENT ',IUBOUND(2),IF_SC3B_G2 - CALL ABORT_TRANS('INV_TRANSAD:SEC. DIMENSION OF PGP3B INCONSISTENT ') - ENDIF - IF(IUBOUND(3) /= IF_SC3B_G3 ) THEN - WRITE(NOUT,*)'INV_TRANSAD:THIRD DIM. OF PGP3B INCONSISTENT ',& - & IUBOUND(3),IF_SC3B_G3 - CALL ABORT_TRANS('INV_TRANSAD:THIRD DIMENSION OF PGP3B INCONSISTENT ') - ENDIF - IF(IUBOUND(4) < NGPBLKS) THEN - WRITE(NOUT,*)'INV_TRANSAD:FOURTH DIM. OF PGP3B TOO SMALL ',IUBOUND(4),NGPBLKS - CALL ABORT_TRANS('INV_TRANSAD:FOURTH DIMENSION OF PGP3B TOO SMALL ') - ENDIF - ELSE - CALL ABORT_TRANS('INV_TRANSAD:PGP3B MISSING') - ENDIF -ENDIF -CALL GSTATS(1809,1) - -! ------------------------------------------------------------------ - -! Perform transform - -CALL INV_TRANS_CTLAD(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,IF_OUT_LT,& - & IF_UV,IF_SCALARS,IF_SCDERS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -IF (LHOOK) CALL DR_HOOK('INV_TRANSAD',1,ZHOOK_HANDLE) - -! ------------------------------------------------------------------ - -!endif INTERFACE - -END SUBROUTINE INV_TRANSAD - diff --git a/src/trans/gpu/external/vordiv_to_uv.F90 b/src/trans/gpu/external/vordiv_to_uv.F90 deleted file mode 100755 index 6b7a71e83..000000000 --- a/src/trans/gpu/external/vordiv_to_uv.F90 +++ /dev/null @@ -1,178 +0,0 @@ -! (C) Copyright 2015- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -SUBROUTINE VORDIV_TO_UV(PSPVOR,PSPDIV,PSPU,PSPV,KSMAX,KVSETUV) - -!**** *VORDIV_TO_UV* - Convert spectral vorticity and divergence to spectral U (u*cos(theta)) and V (v*cos(theta). - -! Purpose. -! -------- -! Interface routine for Convert spectral vorticity and divergence to spectral U and V - -!** Interface. -! ---------- -! CALL VORDIV_TO_UV(...) - -! Explicit arguments : -! -------------------- -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PSPU(:,:) - spectral U (u*cos(theta) (output) -! PSPV(:,:) - spectral V (v*cos(theta) (output) -! KSMAX - spectral resolution (input) -! KVSETUV(:) - Optionally indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. - -! Method. -! ------- - -! Externals. SET_RESOL - set resolution -! ---------- VD2UV_CTL - control vordiv to uv - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 15-06-15 - - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -!ifndef INTERFACE - -USE TPM_GEN ,ONLY : NERR, NOUT,MSETUP0 -USE TPM_DISTR ,ONLY : D, NPRTRV, MYSETV - -USE SET_RESOL_MOD ,ONLY : SET_RESOL -USE VD2UV_CTL_MOD ,ONLY : VD2UV_CTL -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -!endif INTERFACE - -IMPLICIT NONE - -! Declaration of arguments - -REAL(KIND=JPRB), INTENT(IN) :: PSPVOR(:,:) -REAL(KIND=JPRB), INTENT(IN) :: PSPDIV(:,:) -REAL(KIND=JPRB), INTENT(OUT) :: PSPU(:,:) -REAL(KIND=JPRB), INTENT(OUT) :: PSPV(:,:) -INTEGER(KIND=JPIM) , INTENT(IN) :: KSMAX -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) - -!ifndef INTERFACE - -! Local varaibles -INTEGER(KIND=JPIM) :: IUBOUND(4),J -INTEGER(KIND=JPIM) :: IF_UV,IF_UV_G,IRESOL,IDGL -LOGICAL :: LTMP_SETUP0 -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - -#include "setup_trans0.h" -#include "setup_trans.h" -#include "trans_release.h" -#include "trans_end.h" - -! ------------------------------------------------------------------ - -IF (LHOOK) CALL DR_HOOK('VORDIV_TO_UV',0,ZHOOK_HANDLE) - -!CALL GSTATS(XXXX,0) - -IF(MSETUP0 == 0) THEN - CALL SETUP_TRANS0() - LTMP_SETUP0 = .TRUE. -ELSE - LTMP_SETUP0 = .FALSE. -ENDIF -IDGL = 2 ! It doesn't matter as long as it's a positive even number -CALL SETUP_TRANS(KSMAX,IDGL,LDSPSETUPONLY=.TRUE.,KRESOL=IRESOL) -CALL SET_RESOL(IRESOL) - - -! Set defaults - -IF_UV = 0 -IF_UV_G = 0 -! Decide requirements - -IF(PRESENT(KVSETUV)) THEN - IF_UV_G = UBOUND(KVSETUV,1) - DO J=1,IF_UV_G - IF(KVSETUV(J) > NPRTRV .OR. KVSETUV(J) < 1) THEN - WRITE(NERR,*) 'VORDIV_TO_UV:KVSETUV(J) > NPRTRV ',J,KVSETUV(J),NPRTRV - CALL ABORT_TRANS('VORDIV_TO_UV:KVSETUV TOO LONG OR CONTAINS VALUES OUTSIDE RANGE') - ENDIF - IF(KVSETUV(J) == MYSETV) THEN - IF_UV = IF_UV+1 - ENDIF - ENDDO -ELSE - IF_UV = UBOUND(PSPVOR,1) - IF_UV_G = IF_UV -ENDIF - -! Consistency checks - -IF (IF_UV > 0) THEN - IF(UBOUND(PSPVOR,1) < IF_UV) THEN - WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPVOR,1) < IF_UV ',UBOUND(PSPVOR,1),IF_UV - CALL ABORT_TRANS('VORDIV_TO_UV : PSPVOR TOO SHORT') - ENDIF - IF(UBOUND(PSPDIV,1) < IF_UV) THEN - WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPDIV,1) < IF_UV ',UBOUND(PSPDIV,1),IF_UV - CALL ABORT_TRANS('VORDIV_TO_UV : PSPDIV TOO SHORT') - ENDIF - IF(UBOUND(PSPU,1) < IF_UV) THEN - WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPU,1) < IF_UV ',UBOUND(PSPU,1),IF_UV - CALL ABORT_TRANS('VORDIV_TO_UV : PSPU TOO SHORT') - ENDIF - IF(UBOUND(PSPV,1) < IF_UV) THEN - WRITE(NERR,*)'VORDIV_TO_UV : UBOUND(PSPV,1) < IF_UV ',UBOUND(PSPV,1),IF_UV - CALL ABORT_TRANS('VORDIV_TO_UV : PSPV TOO SHORT') - ENDIF -ENDIF - - -IF(NPRTRV >1) THEN - IF(IF_UV > 0 .AND. .NOT. PRESENT(KVSETUV)) THEN - WRITE(NERR,*)'NPRTRV >1 AND IF_UV > 0 AND NOT PRESENT(KVSETUV)',& - &NPRTRV,IF_UV - CALL ABORT_TRANS('VORDIV_TO_UV: SPECIFY VERTICAL SPECTRAL DISTRIBUTION!') - ENDIF -ENDIF - -!CALL GSTATS(XXXX,1) - -! ------------------------------------------------------------------ - -! Perform transform - -CALL VD2UV_CTL(IF_UV,PSPVOR,PSPDIV,PSPU,PSPV) - -CALL TRANS_RELEASE(IRESOL) -IF (LTMP_SETUP0) THEN - CALL TRANS_END() -ENDIF - -IF (LHOOK) CALL DR_HOOK('VORDIV_TO_UV',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ - -!endif INTERFACE - -END SUBROUTINE VORDIV_TO_UV - diff --git a/src/trans/gpu/internal/asre1ad_mod.F90 b/src/trans/gpu/internal/asre1ad_mod.F90 deleted file mode 100755 index 2c9ca158a..000000000 --- a/src/trans/gpu/internal/asre1ad_mod.F90 +++ /dev/null @@ -1,91 +0,0 @@ -! (C) Copyright 2001- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE ASRE1AD_MOD -CONTAINS -SUBROUTINE ASRE1AD(KM,KMLOC,KF_OUT_LT,PAOA1,PSOA1) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -!USE TPM_TRANS - -USE ASRE1BAD_MOD ,ONLY : ASRE1BAD - - -!**** *ASRE1AD* - Recombine antisymmetric and symmetric parts - adjoint - -! Purpose. -! -------- -! To recombine the antisymmetric and symmetric parts of the -! Fourier arrays and update the correct parts of the state -! variables. - -!** Interface. -! ---------- -! *CALL* *ASRE1AD(...) - -! Explicit arguments : -! -------------------- -! KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PAOA1 - antisymmetric part of Fourier -! fields for zonal wavenumber KM (basic -! variables and N-S derivatives) -! PSOA1 - symmetric part of Fourier -! fields for zonal wavenumber KM (basic -! variables and N-S derivatives) - -! Implicit arguments : None -! -------------------- - -! Method. -! ------- - -! Externals. ASRE1BAD - basic recombination routine -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From ASRE1AD in IFS CY22R1 - -! ------------------------------------------------------------------ - - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM) , INTENT(IN) :: KM -INTEGER(KIND=JPIM) , INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM) , INTENT(IN) :: KF_OUT_LT - -REAL(KIND=JPRBT) , INTENT(OUT) :: PSOA1(:,:), PAOA1(:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IFLDS - - -! ------------------------------------------------------------------ - -IFLDS = KF_OUT_LT - -CALL ASRE1BAD(IFLDS,KM,KMLOC,PAOA1,PSOA1) - -! ------------------------------------------------------------------ - -END SUBROUTINE ASRE1AD -END MODULE ASRE1AD_MOD diff --git a/src/trans/gpu/internal/asre1bad_mod.F90 b/src/trans/gpu/internal/asre1bad_mod.F90 deleted file mode 100755 index 34854f4ef..000000000 --- a/src/trans/gpu/internal/asre1bad_mod.F90 +++ /dev/null @@ -1,107 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE ASRE1BAD_MOD -CONTAINS -SUBROUTINE ASRE1BAD(KFIELD,KM,KMLOC,PAOA,PSOA) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : FOUBUF_IN -USE TPM_GEOMETRY ,ONLY : G -USE TPM_DISTR ,ONLY : D - - -!**** *ASRE1BAD* - Recombine antisymmetric and symmetric parts - adjoint - -! Purpose. -! -------- -! To recombine the antisymmetric and symmetric parts of the -! Fourier arrays and update the correct parts of the state -! variables. - -!** Interface. -! ---------- -! *CALL* *ASRE1BAD(..) - -! Explicit arguments : -! ------------------- KFIELD - number of fields (input-c) -! KM - zonal wavenumber(input-c) -! KMLOC - local version of KM (input-c) -! PAOA - antisymmetric part of Fourier -! fields for zonal wavenumber KM (input) -! PSOA - symmetric part of Fourier -! fields for zonal wavenumber KM (input) - -! Implicit arguments : FOUBUF_IN - output buffer (output) -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From ASRE1BAD in IFS CY22R1 - -! ------------------------------------------------------------------ - - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD,KM,KMLOC -REAL(KIND=JPRBT), INTENT(OUT) :: PSOA(:,:) -REAL(KIND=JPRBT), INTENT(OUT) :: PAOA(:,:) - -! LOCAL INTEGERS -INTEGER(KIND=JPIM) :: ISL, IGLS, JFLD, JGL ,IPROC, IPROCS, IDGNH -INTEGER(KIND=JPIM) :: ISTAN(R%NDGNH),ISTAS(R%NDGNH) - -! ------------------------------------------------------------------ - -!* 1. RECOMBINATION OF SYMMETRIC AND ANTSYMMETRIC PARTS. -! --------------------------------------------------- - -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) -IDGNH = R%NDGNH - -!* 1.2 RECOMBINE - -DO JGL=ISL,IDGNH - IPROC = D%NPROCL(JGL) - ISTAN(JGL) = (D%NSTAGT0B(IPROC) + D%NPNTGTB1(KMLOC,JGL))*2*KFIELD - IGLS = R%NDGL+1-JGL - IPROCS = D%NPROCL(IGLS) - ISTAS(JGL) = (D%NSTAGT0B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*2*KFIELD -ENDDO - -DO JGL=ISL,IDGNH -!OCL NOVREC - DO JFLD=1,2*KFIELD - PSOA(JFLD,JGL) = FOUBUF_IN(ISTAN(JGL)+JFLD)+FOUBUF_IN(ISTAS(JGL)+JFLD) - PAOA(JFLD,JGL) = FOUBUF_IN(ISTAN(JGL)+JFLD)-FOUBUF_IN(ISTAS(JGL)+JFLD) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE ASRE1BAD -END MODULE ASRE1BAD_MOD - diff --git a/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 b/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 deleted file mode 100755 index b78148979..000000000 --- a/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 +++ /dev/null @@ -1,193 +0,0 @@ -! (C) Copyright 2001- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE DIR_TRANS_CTLAD_MOD -CONTAINS -SUBROUTINE DIR_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -!**** *DIR_TRANS_CTLAD* - Control routine for direct spectral transform-adj. - -! Purpose. -! -------- -! Control routine for the direct spectral transform - -!** Interface. -! ---------- -! CALL DIR_TRANS_CTLAD(...) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! PSPVOR(:,:) - spectral vorticity -! PSPDIV(:,:) - spectral divergence -! PSPSCALAR(:,:) - spectral scalarvalued fields -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! PGP(:,:,:) - gridpoint fields - -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): - -! u : KF_UV_G fields -! v : KF_UV_G fields -! scalar fields : KF_SCALARS_G fields - -! Method. -! ------- - -! Externals. SHUFFLE - reshuffle fields for load balancing -! ---------- FIELD_SPLIT - split fields in NPROMATR packets -! LTDIR_CTLAD - control of Legendre transform -! FTDIR_CTLAD - control of Fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 01-01-03 - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB -USE TPM_GEN ,ONLY : NPROMATR -!USE TPM_TRANS -!USE TPM_DISTR - -USE SHUFFLE_MOD ,ONLY : SHUFFLE -USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTDIR_CTLAD_MOD ,ONLY : LTDIR_CTLAD -USE FTDIR_CTLAD_MOD ,ONLY : FTDIR_CTLAD -! - -IMPLICIT NONE - -! Declaration of arguments - -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGP(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PGP2(:,:,:) - -! Local variables - -INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) -INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) -INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) -INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G -INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_GPB - - -! ------------------------------------------------------------------ - -! Perform transform - -IF_GPB = 2*KF_UV_G+KF_SCALARS_G -IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN - - ! Fields to be split into packets - - CALL SHUFFLE(KF_UV_G,KF_SCALARS_G,ISHFUV_G,IVSETUV,ISHFSC_G,IVSETSC,& - & KVSETUV,KVSETSC) - - IBLKS=(IF_GPB-1)/NPROMATR+1 - - DO JBLK=1,IBLKS - - CALL FIELD_SPLIT(JBLK,KF_GP,KF_UV_G,IVSETUV,IVSETSC,& - & ISTUV_G,IENUV_G,IF_UV_G,ISTSC_G,IENSC_G,IF_SCALARS_G,& - & ISTUV,IENUV,IF_UV,ISTSC,IENSC,IF_SCALARS) - - IF_FS = 2*IF_UV + IF_SCALARS - IF_GP = 2*IF_UV_G+IF_SCALARS_G - DO JFLD=1,IF_UV_G - IPTRGP(JFLD) = ISHFUV_G(ISTUV_G+JFLD-1) - IPTRGP(JFLD+IF_UV_G) = KF_UV_G+ISHFUV_G(ISTUV_G+JFLD-1) - ENDDO - DO JFLD=1,IF_SCALARS_G - IPTRGP(JFLD+2*IF_UV_G) = 2*KF_UV_G+ISHFSC_G(ISTSC_G+JFLD-1) - ENDDO - DO JFLD=1,IF_UV - IPTRSPUV(JFLD) = ISTUV+JFLD-1 - ENDDO - DO JFLD=1,IF_SCALARS - IPTRSPSC(JFLD) = ISTSC+JFLD-1 - ENDDO - - CALL LTDIR_CTLAD(IF_FS,IF_UV,IF_SCALARS, & - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) - IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTDIR_CTLAD(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) - ELSEIF(IF_UV_G > 0) THEN - CALL FTDIR_CTLAD(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KPTRGP=IPTRGP,PGP=PGP) - ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTDIR_CTLAD(IF_UV_G,IF_SCALARS_G,IF_GP,IF_FS,& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,PGP=PGP) - ENDIF - ENDDO - -ELSE - - ! No splitting of fields, transform done in one go - - CALL LTDIR_CTLAD(KF_FS,KF_UV,KF_SCALARS, & - &PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - &PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) - - CALL FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,& - & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE DIR_TRANS_CTLAD -END MODULE DIR_TRANS_CTLAD_MOD diff --git a/src/trans/gpu/internal/fourier_inad_mod.F90 b/src/trans/gpu/internal/fourier_inad_mod.F90 deleted file mode 100755 index 4d5a0945e..000000000 --- a/src/trans/gpu/internal/fourier_inad_mod.F90 +++ /dev/null @@ -1,73 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FOURIER_INAD_MOD -CONTAINS -SUBROUTINE FOURIER_INAD(PREEL,KFIELDS,KGL) - -!**** *FOURIER_INAD* - Copy fourier data from buffer to local array - adjoint - -! Purpose. -! -------- -! Routine for copying fourier data from buffer to local array - -!** Interface. -! ---------- -! CALL FOURIER_INAD(...) - -! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KFIELDS - number of fields -! -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 2000-04-01 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF -USE TPM_GEOMETRY ,ONLY : G -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL - -REAL(KIND=JPRBT), INTENT(IN) :: PREEL(:,:) - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA - -! ------------------------------------------------------------------ - -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) - IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT0B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS - FOUBUF(ISTA+2*JF-1) = PREEL(JF,IR) - FOUBUF(ISTA+2*JF ) = PREEL(JF,II) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE FOURIER_INAD -END MODULE FOURIER_INAD_MOD - diff --git a/src/trans/gpu/internal/fourier_outad_mod.F90 b/src/trans/gpu/internal/fourier_outad_mod.F90 deleted file mode 100755 index f87540e05..000000000 --- a/src/trans/gpu/internal/fourier_outad_mod.F90 +++ /dev/null @@ -1,72 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FOURIER_OUTAD_MOD -CONTAINS -SUBROUTINE FOURIER_OUTAD(PREEL,KFIELDS,KGL) - -!**** *FOURIER_OUTAD* - Copy fourier data from local array to buffer - adjoint - -! Purpose. -! -------- -! Routine for copying fourier data from local array to buffer - -!** Interface. -! ---------- -! CALL FOURIER_OUTAD(...) - -! Explicit arguments : PREEL - local fourier/GP array -! -------------------- KFIELDS - number of fields -! -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 2000-04-01 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_TRANS ,ONLY : FOUBUF_IN -USE TPM_GEOMETRY ,ONLY : G -! - -IMPLICIT NONE - -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL - -INTEGER(KIND=JPIM) :: JM,JF,IGLG,IPROC,IR,II,ISTA - -! ------------------------------------------------------------------ - -IGLG = D%NPTRLS(MYSETW)+KGL-1 -DO JM=0,G%NMEN(IGLG) - IPROC = D%NPROCM(JM) - IR = 2*JM+1+D%NSTAGTF(KGL) - II = 2*JM+2+D%NSTAGTF(KGL) - ISTA = (D%NSTAGT1B(D%MSTABF(IPROC))+D%NPNTGTB0(JM,KGL))*2*KFIELDS - DO JF=1,KFIELDS - PREEL(JF,IR) = FOUBUF_IN(ISTA+2*JF-1) - PREEL(JF,II) = FOUBUF_IN(ISTA+2*JF ) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE FOURIER_OUTAD -END MODULE FOURIER_OUTAD_MOD - diff --git a/src/trans/gpu/internal/fscad_mod.F90 b/src/trans/gpu/internal/fscad_mod.F90 deleted file mode 100755 index 098381fcb..000000000 --- a/src/trans/gpu/internal/fscad_mod.F90 +++ /dev/null @@ -1,145 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FSCAD_MOD -CONTAINS -SUBROUTINE FSCAD(KGL,KF_UV,KF_SCALARS,KF_SCDERS,& - & PUV,PSCALAR,PNSDERS,PEWDERS,PUVDERS) - -!**** *FSCAD - Division by a*cos(theta), east-west derivatives - adjoint - -! Purpose. -! -------- -! In Fourier space divide u and v and all north-south -! derivatives by a*cos(theta). Also compute east-west derivatives -! of u,v,thermodynamic, passiv scalar variables and surface -! pressure. - -!** Interface. -! ---------- -! CALL FSCAD(..) -! Explicit arguments : PUV - u and v -! -------------------- PSCALAR - scalar valued varaibles -! PNSDERS - N-S derivative of S.V.V. -! PEWDERS - E-W derivative of S.V.V. -! PUVDERS - E-W derivative of u and v -! Method. -! ------- - -! Externals. None. -! ---------- - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 (From SC2FSC) - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_TRANS ,ONLY : LUVDER -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_FIELDS ,ONLY : F -USE TPM_GEOMETRY ,ONLY : G -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) , INTENT(IN) :: KGL,KF_UV,KF_SCALARS,KF_SCDERS -REAL(KIND=JPRBT) , INTENT(INOUT) :: PUV(:,:) -REAL(KIND=JPRBT) , INTENT(INOUT) :: PSCALAR(:,:) -REAL(KIND=JPRBT) , INTENT(INOUT) :: PNSDERS(:,:) -REAL(KIND=JPRBT) , INTENT(INOUT) :: PEWDERS(:,:) -REAL(KIND=JPRBT) , INTENT(INOUT) :: PUVDERS(:,:) - -REAL(KIND=JPRBT) :: ZACHTE,ZMUL -INTEGER(KIND=JPIM) :: IMEN,ISTAGTF - - -INTEGER(KIND=JPIM) :: JLON,JF,IGLG,II,IR,JM - -! ------------------------------------------------------------------ - -IGLG = D%NPTRLS(MYSETW)+KGL-1 -ZACHTE = F%RACTHE(IGLG) -IMEN = G%NMEN(IGLG) -ISTAGTF = D%NSTAGTF(KGL) - - -! ------------------------------------------------------------------ - -!* 2. EAST-WEST DERIVATIVES -! --------------------- - -!* 2.1 U AND V. - -IF(LUVDER)THEN - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - II = IR+1 - ZMUL = ZACHTE*JM - DO JF=1,2*KF_UV - PUV(JF,II) = PUV(JF,II) - PUVDERS(JF,IR)*ZMUL - PUV(JF,IR) = PUV(JF,IR) + PUVDERS(JF,II)*ZMUL -! PUVDERS(JF,IR) = _ZERO_ -! PUVDERS(JF,II) = _ZERO_ - ENDDO - ENDDO -ENDIF - -!* 2.2 SCALAR VARIABLES - -IF(KF_SCDERS > 0)THEN - DO JM=0,IMEN - IR = ISTAGTF+2*JM+1 - II = IR+1 - ZMUL = ZACHTE*JM - DO JF=1,KF_SCALARS - PSCALAR(JF,II) = PSCALAR(JF,II) - PEWDERS(JF,IR)*ZMUL - PSCALAR(JF,IR) = PSCALAR(JF,IR) + PEWDERS(JF,II)*ZMUL -! PEWDERS(JF,IR) = _ZERO_ -! PEWDERS(JF,II) = _ZERO_ - ENDDO - ENDDO -ENDIF - -! ------------------------------------------------------------------ - -!* 1. DIVIDE U V AND N-S DERIVATIVES BY A*COS(THETA) -! ---------------------------------------------- - - -!* 1.1 U AND V. - -IF(KF_UV > 0) THEN - DO JLON=ISTAGTF+1,ISTAGTF+2*(IMEN+1) - DO JF=1,2*KF_UV - PUV(JF,JLON) = PUV(JF,JLON)*ZACHTE - ENDDO - ENDDO -ENDIF - -!* 1.2 N-S DERIVATIVES - -IF(KF_SCDERS > 0)THEN - DO JLON=ISTAGTF+1,ISTAGTF+2*(IMEN+1) - DO JF=1,KF_SCALARS - PNSDERS(JF,JLON) = PNSDERS(JF,JLON)*ZACHTE - ENDDO - ENDDO -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE FSCAD -END MODULE FSCAD_MOD diff --git a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 b/src/trans/gpu/internal/ftdir_ctlad_mod.F90 deleted file mode 100755 index c26f71734..000000000 --- a/src/trans/gpu/internal/ftdir_ctlad_mod.F90 +++ /dev/null @@ -1,188 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTDIR_CTLAD_MOD -CONTAINS -SUBROUTINE FTDIR_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & - & KVSETUV,KVSETSC,KPTRGP,& - & KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) - - -!**** *FTDIR_CTLAD - Direct Fourier transform control - adjoint - -! Purpose. Control routine for Grid-point to Fourier transform -! -------- - -!** Interface. -! ---------- -! CALL FTDIR_CTLAD(..) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! PGP - gridpoint array -! KVSETUV - "B" set in spectral/fourier space for -! u and v variables -! KVSETSC - "B" set in spectral/fourier space for -! scalar variables -! KPTRGP - pointer array to fields in gridpoint space - -! Method. -! ------- - -! Externals. TRGTOL - transposition routine -! ---------- FOURIER_OUT - copy fourier data to Fourier buffer -! FTDIR - fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -!USE TPM_GEN -!USE TPM_DIM -!USE TPM_GEOMETRY -USE TPM_DISTR ,ONLY : D, MYPROC, NPROC - -!USE TRLTOG_MOD ,ONLY : TRLTOG -USE FOURIER_OUTAD_MOD ,ONLY : FOURIER_OUTAD -USE FTDIRAD_MOD ,ONLY : FTDIRAD -! - -IMPLICIT NONE - -! Dummy arguments - -INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) -REAL(KIND=JPRB),OPTIONAL , INTENT(OUT) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(OUT) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(OUT) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(OUT) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(OUT) :: PGP2(:,:,:) - -! Local variables -REAL(KIND=JPRBT) :: ZGTF(KF_FS,D%NLENGTF) - - -INTEGER(KIND=JPIM) :: IST -INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) -INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) -INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: JGL,IGL -INTEGER(KIND=JPIM) :: IFGP2,IFGP3A,IFGP3B,IOFF,J3 -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC - -! ------------------------------------------------------------------ - -! Field distribution in Spectral/Fourier space - -CALL GSTATS(133,0) - -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - -CALL GSTATS(1642,0) -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC - IGL = JGL - CALL FOURIER_OUTAD(ZGTF,KF_FS,IGL) - -! Fourier transform - - IF(KF_FS>0) THEN - CALL FTDIRAD(ZGTF,KF_FS,IGL) - ENDIF -ENDDO -!$OMP END PARALLEL DO -CALL GSTATS(1642,1) -CALL GSTATS(133,1) - -! Transposition - -CALL GSTATS(183,0) -IF(PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) -ELSE - IVSETUV(:) = -1 -ENDIF -IVSETSC(:) = -1 -IF(PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) -ELSE - IOFF=0 - IF(PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 - ENDIF - IF(PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - DO J3=1,UBOUND(PGP3A,3) - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A - ENDDO - ENDIF - IF(PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - DO J3=1,UBOUND(PGP3B,3) - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B - ENDDO - ENDIF -ENDIF - -IST = 1 -IF(KF_UV_G > 0) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF(KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G -ENDIF -stop 4 -!CALL TRLTOG(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& -! &PGP,PGPUV,PGP3A,PGP3B,PGP2) - -CALL GSTATS(183,1) - -! ------------------------------------------------------------------ - -END SUBROUTINE FTDIR_CTLAD -END MODULE FTDIR_CTLAD_MOD - - - diff --git a/src/trans/gpu/internal/ftdirad_mod.F90 b/src/trans/gpu/internal/ftdirad_mod.F90 deleted file mode 100755 index d074fd4cb..000000000 --- a/src/trans/gpu/internal/ftdirad_mod.F90 +++ /dev/null @@ -1,119 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTDIRAD_MOD -CONTAINS -SUBROUTINE FTDIRAD(PREEL,KFIELDS,KGL) - - -!**** *FTDIRAD - Direct Fourier transform - -! Purpose. Routine for Grid-point to Fourier transform - adjoint -! -------- - -!** Interface. -! ---------- -! CALL FTDIRAD(..) - -! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELDS - number of fields - -! Method. -! ------- - -! Externals. FFT992 - FFT routine -! ---------- -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 -! D. Degrauwe (Feb 2012): Alternative extension zone (E') -! G. Mozdzynski (Oct 2014): support for FFTW transforms -! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif -USE TPM_DIM ,ONLY : R - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL -REAL(KIND=JPRBT), INTENT(INOUT) :: PREEL(:,:) - -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,ILOEN -INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN,ITYPE -REAL(KIND=JPRBT) :: ZMUL -LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time -! ------------------------------------------------------------------ - -ITYPE = 1 -IJUMP = 1 -IGLG = D%NPTRLS(MYSETW)+KGL-1 -IST = 2*(G%NMEN(IGLG)+1)+1 -ILOEN = G%NLOEN(IGLG)+R%NNOEXTZL -ILEN = ILOEN+3-IST -IOFF = D%NSTAGTF(KGL)+1 -IRLEN = ILOEN -ICLEN = (IRLEN/2+1)*2 - -DO JJ=1,ILEN - DO JF=1,KFIELDS - PREEL(JF,IST+IOFF-1+JJ-1) = 0.0_JPRBT - ENDDO -ENDDO - -#ifdef WITH_FFTW -IF( .NOT. TW%LFFTW )THEN -#endif - -!! IF( T%LUSEFFT992(KGL) )THEN -!! -!! CALL FFT992(PREEL(1,IOFF),T%TRIGS(1,KGL),& -!! &T%NFAX(1,KGL),KFIELDS,IJUMP,ILOEN,KFIELDS,ITYPE) -!! -!! ELSE -!! -!! CALL BLUESTEIN_FFT(TB,ILOEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) -!! -!! ENDIF - -#ifdef WITH_FFTW -ELSE - -! CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - -ENDIF -#endif - - ! Change of metric (not in forward routine) - -ZMUL = 1.0_JPRBT/ILOEN -DO JJ=1,ILOEN - DO JF=1,KFIELDS - PREEL(JF,IOFF-1+JJ) = PREEL(JF,IOFF-1+JJ)*ZMUL - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE FTDIRAD -END MODULE FTDIRAD_MOD diff --git a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 b/src/trans/gpu/internal/ftinv_ctlad_mod.F90 deleted file mode 100755 index 098f250cf..000000000 --- a/src/trans/gpu/internal/ftinv_ctlad_mod.F90 +++ /dev/null @@ -1,295 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTINV_CTLAD_MOD -CONTAINS -SUBROUTINE FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,KVSETUV,KVSETSC,KPTRGP, & - & KVSETSC3A,KVSETSC3B,KVSETSC2,& - & PGP,PGPUV,PGP3A,PGP3B,PGP2) - - -!**** *FTINV_CTLAD - Inverse Fourier transform control - adjoint - -! Purpose. Control routine for Fourier to Gridpoint transform -! -------- - -!** Interface. -! ---------- -! CALL FTINV_CTLAD(..) - -! Explicit arguments : -! -------------------- -! PGP - gridpoint array -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! KF_SCDERS - local number of derivatives of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_OUT_LT - total number of fields coming out from inverse LT -! KVSETUV - "B" set in spectral/fourier space for -! u and v variables -! KVSETSC - "B" set in spectral/fourier space for -! scalar variables -! KPTRGP - pointer array to fi3elds in gridpoint space - - -! Method. -! ------- - -! Externals. TRLTOG - transposition routine -! ---------- FOURIER_IN - copy fourier data from Fourier buffer -! FTINV - fourier transform -! FSC - Fourier space computations - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_GEN ,ONLY : NERR -!USE TPM_DIM -!USE TPM_GEOMETRY -USE TPM_TRANS ,ONLY : FOUBUF, LDIVGP, LSCDERS, LUVDER, LVORGP -USE TPM_DISTR ,ONLY : D, MYPROC, NPROC - -USE FOURIER_INAD_MOD ,ONLY : FOURIER_INAD -USE FSCAD_MOD ,ONLY : FSCAD -USE FTINVAD_MOD ,ONLY : FTINVAD -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_SCDERS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_OUT_LT -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP(:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB),OPTIONAL , INTENT(IN) :: PGP2(:,:,:) - -! ------------------------------------------------------------------ - -REAL(KIND=JPRBT),TARGET :: ZGTF(KF_FS,D%NLENGTF) -REAL(KIND=JPRBT),TARGET :: ZDUM(1,D%NLENGTF) -REAL(KIND=JPRBT),POINTER :: ZUV(:,:) -REAL(KIND=JPRBT),POINTER :: ZSCALAR(:,:) -REAL(KIND=JPRBT),POINTER :: ZNSDERS(:,:) -REAL(KIND=JPRBT),POINTER :: ZEWDERS(:,:) -REAL(KIND=JPRBT),POINTER :: ZUVDERS(:,:) - -INTEGER(KIND=JPIM) :: IST,IBLEN -INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) -INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) -INTEGER(KIND=JPIM) :: IVSET(KF_GP) -INTEGER(KIND=JPIM) :: J3,JGL,IGL,IOFF,IFGP2,IFGP3A,IFGP3B,IGP3APAR,IGP3BPAR -INTEGER(KIND=JPIM) :: IBEG,IEND,IINC - -! ------------------------------------------------------------------ - -! 4. Transposition - -IF(PRESENT(KVSETUV)) THEN - IVSETUV(:) = KVSETUV(:) -ELSE - IVSETUV(:) = -1 -ENDIF - -IVSETSC(:)=-1 -IF(PRESENT(KVSETSC)) THEN - IVSETSC(:) = KVSETSC(:) -ELSE - IOFF=0 - IF(PRESENT(KVSETSC2)) THEN - IFGP2=UBOUND(KVSETSC2,1) - IVSETSC(1:IFGP2)=KVSETSC2(:) - IOFF=IOFF+IFGP2 - ENDIF - IF(PRESENT(KVSETSC3A)) THEN - IFGP3A=UBOUND(KVSETSC3A,1) - IGP3APAR=UBOUND(PGP3A,3) - IF(LSCDERS) IGP3APAR=IGP3APAR/3 - DO J3=1,IGP3APAR - IVSETSC(IOFF+1:IOFF+IFGP3A)=KVSETSC3A(:) - IOFF=IOFF+IFGP3A - ENDDO - ENDIF - IF(PRESENT(KVSETSC3B)) THEN - IFGP3B=UBOUND(KVSETSC3B,1) - IGP3BPAR=UBOUND(PGP3B,3) - IF(LSCDERS) IGP3BPAR=IGP3BPAR/3 - DO J3=1,IGP3BPAR - IVSETSC(IOFF+1:IOFF+IFGP3B)=KVSETSC3B(:) - IOFF=IOFF+IFGP3B - ENDDO - ENDIF - IF(IOFF > 0 .AND. IOFF /= KF_SCALARS_G ) THEN - WRITE(NERR,*)'FTINV_CTLAD:IOFF,KF_SCALARS_G ',IOFF,KF_SCALARS_G - CALL ABORT_TRANS('FTINV_CTLAD_MOD:IOFF /= KF_SCALARS_G') - ENDIF -ENDIF - -IST = 1 -IF(KF_UV_G > 0) THEN - IF( LVORGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IF( LDIVGP) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - ENDIF - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF(KF_SCALARS_G > 0) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - IF(LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF -ENDIF -IF(KF_UV_G > 0 .AND. LUVDER) THEN - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G - IVSET(IST:IST+KF_UV_G-1) = IVSETUV(:) - IST = IST+KF_UV_G -ENDIF -IF(KF_SCALARS_G > 0) THEN - IF(LSCDERS) THEN - IVSET(IST:IST+KF_SCALARS_G-1) = IVSETSC(:) - IST = IST+KF_SCALARS_G - ENDIF -ENDIF - -CALL GSTATS(182,0) -print *, "not supported..." -flush(6) -stop -! CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - ! &PGP,PGPUV,PGP3A,PGP3B,PGP2) -CALL GSTATS(182,1) - -! 3. Fourier transform - -IF(KF_UV > 0 .OR. KF_SCDERS > 0) THEN - IST = 1 - IF(LVORGP) THEN - IST = IST+KF_UV - ENDIF - IF(LDIVGP) THEN - IST = IST+KF_UV - ENDIF - IF(KF_UV>0)ZUV => ZGTF(IST:IST+2*KF_UV-1,:) - IST = IST+2*KF_UV - IF(KF_SCALARS>0)ZSCALAR => ZGTF(IST:IST+KF_SCALARS-1,:) - IST = IST+KF_SCALARS - IF(KF_SCDERS>0)ZNSDERS => ZGTF(IST:IST+KF_SCDERS-1,:) - IST = IST+KF_SCDERS - IF(LUVDER) THEN - ZUVDERS => ZGTF(IST:IST+2*KF_UV-1,:) - IST = IST+2*KF_UV - ELSE - ZUVDERS => ZDUM(1:1,:) - ENDIF - IF(KF_SCDERS > 0) THEN - ZEWDERS => ZGTF(IST:IST+KF_SCDERS-1,:) - ELSE - ZEWDERS => ZDUM(1:1,:) - ENDIF -ENDIF - -IBLEN = D%NLENGT0B*2*KF_OUT_LT -IF (ALLOCATED(FOUBUF)) THEN - IF (MAX(1,IBLEN) > SIZE(FOUBUF)) THEN - DEALLOCATE(FOUBUF) - ALLOCATE(FOUBUF(MAX(1,IBLEN))) - ENDIF -ELSE - ALLOCATE(FOUBUF(MAX(1,IBLEN))) -ENDIF - -CALL GSTATS(132,0) - -IF(MYPROC > NPROC/2)THEN - IBEG=1 - IEND=D%NDGL_FS - IINC=1 -ELSE - IBEG=D%NDGL_FS - IEND=1 - IINC=-1 -ENDIF - -CALL GSTATS(1641,0) -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JGL,IGL) -DO JGL=IBEG,IEND,IINC - IGL = JGL - IF(KF_FS > 0) THEN - CALL FTINVAD(ZGTF,KF_FS,IGL) - ENDIF - -! 2. Fourier space computations - - IF(KF_UV > 0 .OR. KF_SCDERS > 0) THEN - CALL FSCAD(IGL,KF_UV,KF_SCALARS,KF_SCDERS,& - & ZUV,ZSCALAR,ZNSDERS,ZEWDERS,ZUVDERS) - ENDIF - -! 1. Copy Fourier data to local array - - CALL FOURIER_INAD(ZGTF,KF_OUT_LT,IGL) - -ENDDO -!$OMP END PARALLEL DO -CALL GSTATS(1641,1) - -IF(KF_UV > 0 .OR. KF_SCDERS > 0) THEN - NULLIFY(ZUV) - NULLIFY(ZSCALAR) - NULLIFY(ZNSDERS) - NULLIFY(ZUVDERS) - NULLIFY(ZEWDERS) -ENDIF - -CALL GSTATS(132,1) - -! ------------------------------------------------------------------ - -END SUBROUTINE FTINV_CTLAD -END MODULE FTINV_CTLAD_MOD - - - diff --git a/src/trans/gpu/internal/ftinvad_mod.F90 b/src/trans/gpu/internal/ftinvad_mod.F90 deleted file mode 100755 index 7aa455735..000000000 --- a/src/trans/gpu/internal/ftinvad_mod.F90 +++ /dev/null @@ -1,124 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FTINVAD_MOD -CONTAINS -SUBROUTINE FTINVAD(PREEL,KFIELDS,KGL) - - -!**** *FTINVAD - Inverse Fourier transform - adjoint - -! Purpose. Routine for Fourier to Grid-point transform -! -------- - -!** Interface. -! ---------- -! CALL FTINVAD(..) - -! Explicit arguments : PREEL - Fourier/grid-point array -! -------------------- KFIELDS - number of fields - -! Method. -! ------- - -! Externals. FFT992 - FFT routine -! ---------- -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-03-03 -! D. Degrauwe (Feb 2012): Alternative extension zone (E') -! G. Mozdzynski (Oct 2014): support for FFTW transforms -! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW - -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB, JPRBT - -USE TPM_DISTR ,ONLY : D, MYSETW -USE TPM_DIM ,ONLY : R -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FFT ,ONLY : T -#ifdef WITH_FFTW -USE TPM_FFTW ,ONLY : TW, EXEC_FFTW -#endif -USE TPM_DIM ,ONLY : R -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELDS,KGL -REAL(KIND=JPRBT), INTENT(OUT) :: PREEL(:,:) - -INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,ILOEN -INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN,ITYPE -LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time - -! ------------------------------------------------------------------ - -ITYPE =-1 -IJUMP = 1 -IGLG = D%NPTRLS(MYSETW)+KGL-1 -ILOEN = G%NLOEN(IGLG)+R%NNOEXTZL -IST = 2*(G%NMEN(IGLG)+1)+1 -ILEN = ILOEN+3-IST -IOFF = D%NSTAGTF(KGL)+1 -IRLEN = ILOEN -ICLEN = (IRLEN/2+1)*2 - - ! Change of metric (not in forward routine) -DO JJ=1,ILOEN - DO JF=1,KFIELDS - PREEL(JF,IOFF-1+JJ) = PREEL(JF,IOFF-1+JJ)*ILOEN - ENDDO -ENDDO - -#ifdef WITH_FFTW -IF( .NOT. TW%LFFTW )THEN -#endif - -!! IF( T%LUSEFFT992(KGL) )THEN -!! -!! CALL FFT992(PREEL(1,IOFF),T%TRIGS(1,KGL),& -!! &T%NFAX(1,KGL),KFIELDS,IJUMP,ILOEN,KFIELDS,ITYPE) -!! -!! ELSE -!! -!! CALL BLUESTEIN_FFT(TB,ILOEN,ITYPE,KFIELDS,PREEL(1:KFIELDS,IOFF:IOFF+ICLEN-1)) -!! DO JJ=1,ICLEN -!! DO JF=1,KFIELDS -!! PREEL(JF,IOFF-1+JJ)=PREEL(JF,IOFF-1+JJ)/REAL(ILOEN,JPRBT) -!! ENDDO -!! ENDDO -!! -!! ENDIF - -#ifdef WITH_FFTW -ELSE - -! CALL EXEC_FFTW(ITYPE,IRLEN,ICLEN,IOFF,KFIELDS,LL_ALL,PREEL) - -ENDIF -#endif - -DO JJ=1,ILEN - DO JF=1,KFIELDS - PREEL(JF,IST+IOFF-1+JJ-1) = 0.0_JPRBT - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE FTINVAD -END MODULE FTINVAD_MOD diff --git a/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 b/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 deleted file mode 100755 index b1729fe92..000000000 --- a/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 +++ /dev/null @@ -1,295 +0,0 @@ -! (C) Copyright 2001- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE INV_TRANS_CTLAD_MOD -CONTAINS -SUBROUTINE INV_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& - & KF_UV,KF_SCALARS,KF_SCDERS,& - & PSPVOR,PSPDIV,PSPSCALAR,KVSETUV,KVSETSC,PGP,& - & PSPSC3A,PSPSC3B,PSPSC2,KVSETSC3A,KVSETSC3B,KVSETSC2,PGPUV,PGP3A,PGP3B,PGP2) - -!**** *INV_TRANS_CTLAD* - Control routine for inverse spectral transform adj. - -! Purpose. -! -------- -! Control routine for the inverse spectral transform - -!** Interface. -! ---------- -! CALL INV_TRANS_CTLAD(...) - -! Explicit arguments : -! -------------------- -! KF_UV_G - global number of spectral u-v fields -! KF_SCALARS_G - global number of scalar spectral fields -! KF_GP - total number of output gridpoint fields -! KF_FS - total number of fields in fourier space -! KF_OUT_LT - total number of fields coming out from inverse LT -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! KF_SCDERS - local number of derivatives of scalar spectral fields -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PSPSCALAR(:,:) - spectral scalarvalued fields (input) -! KVSETUV(:) - indicating which 'b-set' in spectral space owns a -! vor/div field. Equivalant to NBSETLEV in the IFS. -! The length of KVSETUV should be the GLOBAL number -! of u/v fields which is the dimension of u and v releated -! fields in grid-point space. -! KVESETSC(:) - indicating which 'b-set' in spectral space owns a -! scalar field. As for KVSETUV this argument is required -! if the total number of processors is greater than -! the number of processors used for distribution in -! spectral wave space. -! PGP(:,:,:) - gridpoint fields (output) - -! The ordering of the output fields is as follows (all -! parts are optional depending on the input switches): -! -! vorticity : KF_UV_G fields -! divergence : KF_UV_G fields -! u : KF_UV_G fields -! v : KF_UV_G fields -! scalar fields : KF_SCALARS_G fields -! N-S derivative of scalar fields : KF_SCALARS_G fields -! E-W derivative of u : KF_UV_G fields -! E-W derivative of v : KF_UV_G fields -! E-W derivative of scalar fields : KF_SCALARS_G fields -! -! Method. -! ------- - -! Externals. SHUFFLE - reshuffle fields for load balancing -! ---------- FIELD_SPLIT - split fields in NPROMATR packets -! LTINV_CTLAD - control of Legendre transform -! FTINV_CTLAD - control of Fourier transform - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 01-01-03 - -! ------------------------------------------------------------------ - - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_GEN ,ONLY : NPROMATR -USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP -!USE TPM_DISTR - -USE SHUFFLE_MOD ,ONLY : SHUFFLE -USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT -USE LTINV_CTLAD_MOD ,ONLY : LTINV_CTLAD -USE FTINV_CTLAD_MOD ,ONLY : FTINV_CTLAD -! - -IMPLICIT NONE - -! Declaration of arguments - -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS_G -INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP -INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3A(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC3B(:) -INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC2(:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGPUV(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3A(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP3B(:,:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(IN) :: PGP2(:,:,:) - -! Local variables - -INTEGER(KIND=JPIM) :: IPTRGP(KF_GP),IPTRSPUV(NPROMATR),IPTRSPSC(NPROMATR) -INTEGER(KIND=JPIM) :: ISHFUV_G(KF_GP),ISHFSC_G(KF_GP) -INTEGER(KIND=JPIM) :: IVSETUV(KF_GP),IVSETSC(KF_GP) -INTEGER(KIND=JPIM) :: IBLKS,JBLK,ISTUV_G,IENUV_G -INTEGER(KIND=JPIM) :: IF_UV_G,IF_UV,ISTUV,IF_SCALARS,IF_SCALARS_G,IF_FS,IF_GP -INTEGER(KIND=JPIM) :: JFLD,ISTSC_G,IENSC_G,ISTSC,IENSC,IENUV,IF_SCDERS,IF_OUT_LT -INTEGER(KIND=JPIM) :: IOFFD,IOFFU,IOFFV,IOFFUVD,IOFFSC,IOFFSCNS,IOFFSCEW,IOFF,IF_GPB - -! ------------------------------------------------------------------ - -! Perform transform - - -IF_GPB = 2*KF_UV_G+KF_SCALARS_G -IF(NPROMATR > 0 .AND. IF_GPB > NPROMATR) THEN - - ! Fields to be split into packets - - CALL SHUFFLE(KF_UV_G,KF_SCALARS_G,ISHFUV_G,IVSETUV,ISHFSC_G,IVSETSC,& - & KVSETUV,KVSETSC) - - IBLKS=(IF_GPB-1)/NPROMATR+1 - - DO JBLK=1,IBLKS - - CALL FIELD_SPLIT(JBLK,IF_GPB,KF_UV_G,IVSETUV,IVSETSC,& - & ISTUV_G,IENUV_G,IF_UV_G,ISTSC_G,IENSC_G,IF_SCALARS_G,& - & ISTUV,IENUV,IF_UV,ISTSC,IENSC,IF_SCALARS) - - IF(LSCDERS) THEN - IF_SCDERS = IF_SCALARS - ELSE - IF_SCDERS = 0 - ENDIF - - IF_OUT_LT = 2*IF_UV + IF_SCALARS+IF_SCDERS - IF(LVORGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV - ENDIF - IF(LDIVGP) THEN - IF_OUT_LT = IF_OUT_LT+IF_UV - ENDIF - IF_FS = IF_OUT_LT+IF_SCDERS - IF(LUVDER) THEN - IF_FS = IF_FS+2*IF_UV - ENDIF - - IF_GP = 2*IF_UV_G+IF_SCALARS_G - IOFFD = 0 - IOFFU = 0 - IOFFV = KF_UV_G - IOFFUVD = 2*KF_UV_G+KF_SCALARS_G - IOFFSC = 2*KF_UV_G - IF(LVORGP) THEN - IF_GP = IF_GP+IF_UV_G - IOFFD = KF_UV_G - IOFFU = IOFFU+KF_UV_G - IOFFV = IOFFV+KF_UV_G - IOFFUVD =IOFFUVD+KF_UV_G - IOFFSC = IOFFSC+KF_UV_G - ENDIF - IF(LDIVGP) THEN - IF_GP = IF_GP+IF_UV_G - IOFFU = IOFFU+KF_UV_G - IOFFV = IOFFV+KF_UV_G - IOFFUVD =IOFFUVD+KF_UV_G - IOFFSC = IOFFSC+KF_UV_G - ENDIF - IF(LSCDERS) THEN - IF_GP = IF_GP+2*IF_SCALARS_G - IOFFUVD =IOFFUVD+KF_SCALARS_G - IOFFSCNS = IOFFSC+KF_SCALARS_G - IOFFSCEW = IOFFSC+2*KF_SCALARS_G - ENDIF - IF(LUVDER) THEN - IF_GP = IF_GP+2*IF_UV_G - IOFFSCEW = IOFFSCEW+2*KF_UV_G - ENDIF - - DO JFLD=1,IF_UV_G - IOFF = 0 - IF(LVORGP) THEN - IPTRGP(JFLD+IOFF) = ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - ENDIF - IF(LDIVGP) THEN - IPTRGP(JFLD+IOFF) = IOFFD+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - ENDIF - IPTRGP(JFLD+IOFF) = IOFFU+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFV+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G+IF_SCALARS_G - IF(LSCDERS) THEN - IOFF = IOFF+IF_SCALARS_G - ENDIF - IF(LUVDER) THEN - IPTRGP(JFLD+IOFF) = IOFFUVD+ISHFUV_G(ISTUV_G+JFLD-1) - IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFUVD+KF_UV_G+ISHFUV_G(ISTUV_G+JFLD-1) - ENDIF - ENDDO - - DO JFLD=1,IF_SCALARS_G - IOFF = 2*IF_UV_G - IF (LVORGP) IOFF = IOFF+IF_UV_G - IF (LDIVGP) IOFF = IOFF+IF_UV_G - IPTRGP(JFLD+IOFF) = IOFFSC+ISHFSC_G(ISTSC_G+JFLD-1) - IOFF = IOFF+IF_SCALARS_G - IF(LSCDERS) THEN - IPTRGP(JFLD+IOFF) = IOFFSCNS+ISHFSC_G(ISTSC_G+JFLD-1) - IOFF = IOFF+IF_SCALARS_G - IF(LUVDER) THEN - IOFF = IOFF+2*IF_UV_G - ENDIF - IPTRGP(JFLD+IOFF) = IOFFSCEW+ISHFSC_G(ISTSC_G+JFLD-1) - ENDIF - ENDDO - DO JFLD=1,IF_UV - IPTRSPUV(JFLD) = ISTUV+JFLD-1 - ENDDO - DO JFLD=1,IF_SCALARS - IPTRSPSC(JFLD) = ISTSC+JFLD-1 - ENDDO - - IF(IF_UV_G > 0 .AND. IF_SCALARS_G > 0) THEN - CALL FTINV_CTLAD(IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& - & PGP=PGP) - ELSEIF(IF_UV_G > 0) THEN - CALL FTINV_CTLAD(IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETUV=IVSETUV(ISTUV_G:IENUV_G),& - & KPTRGP=IPTRGP,& - & PGP=PGP) - ELSEIF(IF_SCALARS_G > 0) THEN - CALL FTINV_CTLAD(IF_UV_G,IF_SCALARS_G,& - & IF_UV,IF_SCALARS,IF_SCDERS,IF_GP,IF_FS,IF_OUT_LT,& - & KVSETSC=IVSETSC(ISTSC_G:IENSC_G),KPTRGP=IPTRGP,& - & PGP=PGP) - ENDIF - CALL LTINV_CTLAD(IF_OUT_LT,IF_UV,IF_SCALARS,IF_SCDERS, & - & PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - & KFLDPTRUV=IPTRSPUV,KFLDPTRSC=IPTRSPSC) - - ENDDO - -ELSE - - ! No splitting of fields, transform done in one go - - CALL FTINV_CTLAD(KF_UV_G,KF_SCALARS_G,& - & KF_UV,KF_SCALARS,KF_SCDERS,KF_GP,KF_FS,KF_OUT_LT,& - & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& - & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& - & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - - CALL LTINV_CTLAD(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS, & - &PSPVOR=PSPVOR,PSPDIV=PSPDIV,PSPSCALAR=PSPSCALAR,& - &PSPSC3A=PSPSC3A,PSPSC3B=PSPSC3B,PSPSC2=PSPSC2) - -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE INV_TRANS_CTLAD -END MODULE INV_TRANS_CTLAD_MOD diff --git a/src/trans/gpu/internal/ldfou2ad_mod.F90 b/src/trans/gpu/internal/ldfou2ad_mod.F90 deleted file mode 100755 index 0763a2571..000000000 --- a/src/trans/gpu/internal/ldfou2ad_mod.F90 +++ /dev/null @@ -1,96 +0,0 @@ -! (C) Copyright 1991- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LDFOU2AD_MOD -CONTAINS -SUBROUTINE LDFOU2AD(KM,KF_UV,PAIA,PSIA) - -!**** *LDFOU2AD* - Division by a*cos(theta) of u and v - -! Purpose. -! -------- -! In Fourier space divide u and v by a*cos(theta). - -!** Interface. -! ---------- -! CALL LDFOU2AD(KM,PAIA,PSIA) - -! Explicit arguments : -! -------------------- KM - zonal wavenumber -! PAIA - antisymmetric fourier fields -! PSIA - symmetric fourierfields - -! Implicit arguments : RACTHE - 1./(a*cos(theta)) -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 91-07-01 -! Modified : 94-04-06 R. El Khatib - Full-POS configuration 'P' -! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div -! instead of u,v->vor,div -! MPP Group: 95-10-01 Message Passing option added -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FIELDS ,ONLY : F -! - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KM,KF_UV - -REAL(KIND=JPRBT) ,INTENT(INOUT) :: PSIA(:,:), PAIA(:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: J, JGL ,IFLD ,ISL - - -! ------------------------------------------------------------------ - -!* 1. DIVIDE U V BY A*COS(THETA) -! -------------------------- - -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) -IFLD = 4*KF_UV - -!* 1.1 U AND V - -DO JGL=ISL,R%NDGNH - DO J=1,IFLD - PAIA(J,JGL) = PAIA(J,JGL)*F%RACTHE(JGL) - PSIA(J,JGL) = PSIA(J,JGL)*F%RACTHE(JGL) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE LDFOU2AD -END MODULE LDFOU2AD_MOD diff --git a/src/trans/gpu/internal/ledirad_mod.F90 b/src/trans/gpu/internal/ledirad_mod.F90 deleted file mode 100755 index da66e318a..000000000 --- a/src/trans/gpu/internal/ledirad_mod.F90 +++ /dev/null @@ -1,206 +0,0 @@ -! (C) Copyright 1988- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LEDIRAD_MOD -CONTAINS -SUBROUTINE LEDIRAD(KM,KMLOC,KFC,KIFC,KDGLU,KLED2,PAIA,PSIA,POA1) - -!**** *LEDIRAD* - Direct Legendre transform. - -! Purpose. -! -------- -! Direct Legendre tranform of state variables. - -!** Interface. -! ---------- -! CALL LEDIRAD(...) - -! Explicit arguments : KM - zonal wavenumber -! -------------------- KFC - number of field to transform -! PAIA - antisymmetric part of Fourier -! fields for zonal wavenumber KM -! PSIA - symmetric part of Fourier -! fields for zonal wavenumber KM -! POA1 - spectral -! fields for zonal wavenumber KM -! PLEPO - Legendre polonomials - -! Implicit arguments : None. -! -------------------- - -! Method. -! ------- - -! Externals. MXMAOP - matrix multiply -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 88-01-28 -! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite -! for uv formulation -! Modified : 93-03-19 D. Giard - NTMAX instead of NSMAX -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! Modified ! 16/10/12 J.Hague : DR_HOOK round calls to DGEMM: -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT ,JPRD -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -USE TPM_DIM ,ONLY : R -USE TPM_GEOMETRY ,ONLY : G -!USE TPM_TRANS -! -USE TPM_FLT -USE TPM_FIELDS -USE TPM_DISTR -USE BUTTERFLY_ALG_MOD - -IMPLICIT NONE - - -! DUMMY ARGUMENTS -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KFC -INTEGER(KIND=JPIM), INTENT(IN) :: KIFC -INTEGER(KIND=JPIM), INTENT(IN) :: KDGLU -INTEGER(KIND=JPIM), INTENT(IN) :: KLED2 - -REAL(KIND=JPRBT), INTENT(OUT) :: PSIA(:,:), PAIA(:,:) -REAL(KIND=JPRBT), INTENT(IN) :: POA1(:,:) - -INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, J, JK,JGL,J1 -INTEGER(KIND=JPIM) :: IF,ITHRESHOLD -REAL(KIND=JPRBT) :: ZB(KDGLU,KIFC), ZCA((R%NTMAX-KM+2)/2,KIFC), ZCS((R%NTMAX-KM+3)/2,KIFC) -LOGICAL, PARAMETER :: LLDOUBLE = (JPRD == JPRBT) -CHARACTER(LEN=1) :: CLX -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - -! ------------------------------------------------------------------ - -!* 1. PERFORM LEGENDRE TRANFORM. -! -------------------------- - -!* 1.1 PREPARATIONS. - -CLX = 'S' -IF (LLDOUBLE) CLX = 'D' - -IA = 1+MOD(R%NTMAX-KM+2,2) -IS = 1+MOD(R%NTMAX-KM+1,2) -ILA = (R%NTMAX-KM+2)/2 -ILS = (R%NTMAX-KM+3)/2 -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) - -IF(KM == 0)THEN - ISKIP = 2 - DO JGL=ISL,R%NDGNH - DO J1=2,KFC,2 - PSIA(J1,JGL)=0.0_JPRBT - PAIA(J1,JGL)=0.0_JPRBT - ENDDO - ENDDO -ELSE - ISKIP = 1 -ENDIF - - -IF (KIFC > 0 .AND. KDGLU > 0 ) THEN - - ITHRESHOLD=S%ITHRESHOLD - -!* 1. ANTISYMMETRIC PART. - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO J=1,ILA - ZCA(J,IF) = POA1(IA+(J-1)*2,JK) - ENDDO - ENDDO - - IF(ILA <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRBT,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZCA,ILA,0._JPRBT,ZB,KDGLU) - ELSE - CALL SGEMM('N','N',KDGLU,KIFC,ILA,1.0_JPRBT,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZCA,ILA,0._JPRBT,ZB,KDGLU) - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) - - ELSE - - CALL MULT_BUTM('N',S%FA(KMLOC)%YBUT_STRUCT_A,KIFC,ZCA,ZB) - - ENDIF - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO J=1,KDGLU - PAIA(JK,ISL+J-1) = ZB(J,IF)*F%RW(ISL+J-1) - ENDDO - ENDDO - - -!* 1.3 SYMMETRIC PART. - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO J=1,ILS - ZCS(J,IF) = POA1(IS+(J-1)*2,JK) - ENDDO - ENDDO - - - IF(ILS <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN - - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRBT,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZCS,ILS,0._JPRBT,ZB,KDGLU) - ELSE - CALL SGEMM('N','N',KDGLU,KIFC,ILS,1.0_JPRBT,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZCS,ILS,0._JPRBT,ZB,KDGLU) - - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',1,ZHOOK_HANDLE) - - ELSE - - CALL MULT_BUTM('N',S%FA(KMLOC)%YBUT_STRUCT_S,KIFC,ZCS,ZB) - - ENDIF - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO J=1,KDGLU - PSIA(JK,ISL+J-1) = ZB(J,IF)*F%RW(ISL+J-1) - ENDDO - ENDDO - -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE LEDIRAD -END MODULE LEDIRAD_MOD diff --git a/src/trans/gpu/internal/leinvad_mod.F90 b/src/trans/gpu/internal/leinvad_mod.F90 deleted file mode 100755 index b1695ff55..000000000 --- a/src/trans/gpu/internal/leinvad_mod.F90 +++ /dev/null @@ -1,196 +0,0 @@ -! (C) Copyright 2001- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LEINVAD_MOD -CONTAINS -SUBROUTINE LEINVAD(KM,KMLOC,KFC,KIFC,KF_OUT_LT,KDGLU,PIA,PAOA1,PSOA1) - -!**** *LEINVAD* - Inverse Legendre transform. - -! Purpose. -! -------- -! Inverse Legendre tranform of all variables(kernel). - -!** Interface. -! ---------- -! CALL LEINVAD(...) - -! Explicit arguments : KM - zonal wavenumber (input-c) -! -------------------- KFC - number of fields to tranform (input-c) -! PIA - spectral fields -! for zonal wavenumber KM (input) -! PAOA1 - antisymmetric part of Fourier -! fields for zonal wavenumber KM (output) -! PSOA1 - symmetric part of Fourier -! fields for zonal wavenumber KM (output) - -! Implicit arguments : None. -! -------------------- - -! Method. -! ------- - -! Externals. MXMAOP - calls SGEMVX (matrix multiply) -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From LEINVAD in IFS CY22R1 -! Modified ! 16/10/12 J.Hague : DR_HOOK round calls to DGEMM: -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -USE TPM_DIM ,ONLY : R -USE TPM_GEOMETRY ,ONLY : G -USE TPM_FIELDS ,ONLY : F -!USE TPM_TRANS -USE TPM_DISTR ,ONLY : D -! -USE TPM_FLT -USE BUTTERFLY_ALG_MOD - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KFC -INTEGER(KIND=JPIM), INTENT(IN) :: KIFC -INTEGER(KIND=JPIM), INTENT(IN) :: KDGLU -INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT -REAL(KIND=JPRB), INTENT(OUT) :: PIA(:,:) -REAL(KIND=JPRBT), INTENT(INOUT) :: PSOA1(:,:) -REAL(KIND=JPRBT), INTENT(INOUT) :: PAOA1(:,:) - -! LOCAL VARIABLES -INTEGER(KIND=JPIM) :: IA, ILA, ILS, IS, ISKIP, ISL, IOAD1, JK,JI -INTEGER(KIND=JPIM) :: IF,ITHRESHOLD -REAL(KIND=JPRBT) :: ZBA((R%NSMAX-KM+2)/2,KIFC), ZBS((R%NSMAX-KM+3)/2,KIFC), ZC(KDGLU,KIFC) -LOGICAL, PARAMETER :: LLDOUBLE = (JPRD == JPRBT) -CHARACTER(LEN=1) :: CLX -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - -! ------------------------------------------------------------------ - -!* 1. PERFORM LEGENDRE TRANFORM. -! -------------------------- - -!* 1.1 PREPARATIONS. - -CLX = 'S' -IF (LLDOUBLE) CLX = 'D' - -IA = 1+MOD(R%NSMAX-KM+2,2) -IS = 1+MOD(R%NSMAX-KM+1,2) -ILA = (R%NSMAX-KM+2)/2 -ILS = (R%NSMAX-KM+3)/2 -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) -IOAD1 = 2*KF_OUT_LT - -IF(KM == 0)THEN - ISKIP = 2 -ELSE - ISKIP = 1 -ENDIF - -IF( KDGLU > 0 ) THEN - - ITHRESHOLD=S%ITHRESHOLD - - -! 1. +++++++++++++ anti-symmetric - - ! we need the transpose of C - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO JI=1,KDGLU - ZC(JI,IF) = PAOA1(JK,ISL+JI-1) - ENDDO - ENDDO - - IF(ILA <= ITHRESHOLD .OR. .NOT.S%LUSEFLT) THEN - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRBT,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZC,KDGLU,0._JPRBT,ZBA,ILA) - ELSE - CALL SGEMM('T','N',ILA,KIFC,KDGLU,1.0_JPRBT,S%FA(KMLOC)%RPNMA,KDGLU,& - &ZC,KDGLU,0._JPRBT,ZBA,ILA) - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_1',1,ZHOOK_HANDLE) - - ELSE - - CALL MULT_BUTM('T',S%FA(KMLOC)%YBUT_STRUCT_A,KIFC,ZC,ZBA) - - ENDIF - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO JI=1,ILA - PIA(IA+1+(JI-1)*2,JK) = ZBA(JI,IF) - ENDDO - ENDDO - -! 2. +++++++++++++ symmetric - - ! we need the transpose of C - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO JI=1,KDGLU - ZC(JI,IF) = PSOA1(JK,ISL+JI-1) - ENDDO - ENDDO - - IF(ILS <= ITHRESHOLD .OR. .NOT.S%LUSEFLT ) THEN - - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',0,ZHOOK_HANDLE) - IF(LLDOUBLE)THEN - CALL DGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRBT,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZC,KDGLU,0._JPRBT,ZBS,ILS) - ELSE - CALL SGEMM('T','N',ILS,KIFC,KDGLU,1.0_JPRBT,S%FA(KMLOC)%RPNMS,KDGLU,& - &ZC,KDGLU,0._JPRBT,ZBS,ILS) - END IF - IF (LHOOK) CALL DR_HOOK('LE_'//CLX//'GEMM_2',1,ZHOOK_HANDLE) - - ELSE - - CALL MULT_BUTM('T',S%FA(KMLOC)%YBUT_STRUCT_S,KIFC,ZC,ZBS) - - ENDIF - - IF=0 - DO JK=1,KFC,ISKIP - IF=IF+1 - DO JI=1,ILS - PIA(IS+1+(JI-1)*2,JK) = ZBS(JI,IF) - ENDDO - ENDDO - - -ENDIF -! -! ------------------------------------------------------------------ - - -END SUBROUTINE LEINVAD -END MODULE LEINVAD_MOD diff --git a/src/trans/gpu/internal/ltdir_ctlad_mod.F90 b/src/trans/gpu/internal/ltdir_ctlad_mod.F90 deleted file mode 100755 index 1bf9cbf50..000000000 --- a/src/trans/gpu/internal/ltdir_ctlad_mod.F90 +++ /dev/null @@ -1,110 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTDIR_CTLAD_MOD -CONTAINS -SUBROUTINE LTDIR_CTLAD(KF_FS,KF_UV,KF_SCALARS, & - & PSPVOR,PSPDIV,PSPSCALAR, & - & PSPSC3A,PSPSC3B,PSPSC2, & - & KFLDPTRUV,KFLDPTRSC) - -!**** *LTDIR_CTLAD* - Control routine for direct Legendre transform - -! Purpose. -! -------- -! Direct Legendre transform - -!** Interface. -! ---------- -! CALL LTDIR_CTLAD(...) - -! Explicit arguments : -! -------------------- -! PSPVOR(:,:) - spectral vorticity (output) -! PSPDIV(:,:) - spectral divergence (output) -! PSPSCALAR(:,:) - spectral scalarvalued fields (output) - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_GEN ,ONLY : LALLOPERM -!USE TPM_DIM -USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN -USE TPM_DISTR ,ONLY : D - -USE LTDIRAD_MOD ,ONLY : LTDIRAD -!USE TRMTOL_MOD ,ONLY : TRMTOL -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - -INTEGER(KIND=JPIM) :: JM,IM,IBLEN,ILED2 - -! ------------------------------------------------------------------ - -! Transposition from Fourier space distribution to spectral space distribution - -CALL GSTATS(105,0) -IBLEN = D%NLENGT0B*2*KF_FS -IF (ALLOCATED(FOUBUF_IN)) THEN - IF (MAX(1,IBLEN) > SIZE(FOUBUF_IN)) THEN - DEALLOCATE(FOUBUF_IN) - ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) - ENDIF -ELSE - ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) -ENDIF -IF (ALLOCATED(FOUBUF)) THEN - IF (MAX(1,IBLEN) > SIZE(FOUBUF)) THEN - DEALLOCATE(FOUBUF) - ALLOCATE(FOUBUF(MAX(1,IBLEN))) - ENDIF -ELSE - ALLOCATE(FOUBUF(MAX(1,IBLEN))) -ENDIF - -! Direct Legendre transform - -ILED2 = 2*KF_FS -CALL GSTATS(1646,0) -IF(KF_FS > 0) THEN -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JM,IM) - DO JM=1,D%NUMP - IM = D%MYMS(JM) - CALL LTDIRAD(IM,JM,KF_FS,KF_UV,KF_SCALARS,ILED2, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - ENDDO -!$OMP END PARALLEL DO -ENDIF -CALL GSTATS(1646,1) - -CALL GSTATS(105,1) - -CALL GSTATS(181,0) -!CALL TRMTOL(FOUBUF,FOUBUF_IN,2*KF_FS) -CALL GSTATS(181,1) -IF (.NOT.LALLOPERM) DEALLOCATE(FOUBUF) -! ------------------------------------------------------------------ - -END SUBROUTINE LTDIR_CTLAD -END MODULE LTDIR_CTLAD_MOD diff --git a/src/trans/gpu/internal/ltdirad_mod.F90 b/src/trans/gpu/internal/ltdirad_mod.F90 deleted file mode 100755 index 63c6b2b59..000000000 --- a/src/trans/gpu/internal/ltdirad_mod.F90 +++ /dev/null @@ -1,188 +0,0 @@ -! (C) Copyright 1987- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTDIRAD_MOD -CONTAINS -SUBROUTINE LTDIRAD(KM,KMLOC,KF_FS,KF_UV,KF_SCALARS,KLED2,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_GEOMETRY - -USE PREPSNM_MOD ,ONLY : PREPSNM -USE PRFI2AD_MOD ,ONLY : PRFI2AD -USE LDFOU2AD_MOD ,ONLY : LDFOU2AD -USE LEDIRAD_MOD ,ONLY : LEDIRAD -USE UVTVDAD_MOD -USE UPDSPAD_MOD ,ONLY : UPDSPAD - - -!**** *LTDIRAD* - Control of Direct Legendre transform step - adjoint - -! Purpose. -! -------- -! Tranform from Fourier space to spectral space, compute -! vorticity and divergence. - -!** Interface. -! ---------- -! *CALL* *LTDIRAD(...)* - -! Explicit arguments : -! -------------------- KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PSPVOR - spectral vorticity -! PSPDIV - spectral divergence -! PSPSCALAR - spectral scalar variables - -! Implicit arguments : None -! -------------------- - -! Method. -! ------- - -! Externals. -! ---------- -! PREPSNM - prepare REPSNM for wavenumber KM -! PRFI2AD - prepares the Fourier work arrays for model variables. -! LDFOU2AD - computations in Fourier space -! LEDIRAD - direct Legendre transform -! UVTVDAD - -! UPDSPAD - updating of spectral arrays (fields) - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 87-11-24 -! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite -! for uv formulation -! Modified 93-03-19 D. Giard - CDCONF='T' for tendencies -! Modified 93-11-18 M. Hamrud - use only one Fourier buffer -! Modified 94-04-06 R. El khatib Full-POS implementation -! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div -! instead of u,v->vor,div -! MPP Group : 95-10-01 Support for Distributed Memory version -! K. YESSAD (AUGUST 1996): -! - Legendre transforms for transmission coefficients. -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! R. El Khatib 12-Jul-2012 LDSPC2AD replaced by UVTVDAD -! ------------------------------------------------------------------ - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_UV,KF_SCALARS,KLED2 - -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC2(:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC3B(:,:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IFC, IIFC, IDGLU -INTEGER(KIND=JPIM) :: IUS, IUE, IVS, IVE, IVORS, IVORE, IDIVS, IDIVE - -! LOCAL REALS -REAL(KIND=JPRBT) :: ZSIA(KLED2,R%NDGNH), ZAIA(KLED2,R%NDGNH) -REAL(KIND=JPRBT) :: ZEPSNM(0:R%NTMAX+2) -REAL(KIND=JPRBT) :: ZOA1(R%NLED4,KLED2), ZOA2(R%NLED4,MAX(4*KF_UV,1)) - - -! ------------------------------------------------------------------ - -!* 1. PREPARE LEGENDRE POLONOMIALS AND EPSNM -! -------------------------------------- - - - - -! ------------------------------------------------------------------ - -!* 6. UPDATE SPECTRAL ARRAYS. -! ----------------------- - -CALL UPDSPAD(KM,KF_UV,KF_SCALARS,ZOA1,ZOA2, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - -! ------------------------------------------------------------------ - -!* 5. COMPUTE VORTICITY AND DIVERGENCE. -! --------------------------------- - -IF( KF_UV > 0 ) THEN - stop 'Error: code path not (yet) supported in GPU version' - !CALL PREPSNM(KM,KMLOC,ZEPSNM) - IUS = 1 - IUE = 2*KF_UV - IVS = 2*KF_UV+1 - IVE = 4*KF_UV - IVORS = 1 - IVORE = 2*KF_UV - IDIVS = 2*KF_UV+1 - IDIVE = 4*KF_UV -! SET PART OF ZOA1 CONTAINING U AND V TO 0. - ZOA1(:,IUS:IVE) = 0.0_JPRB - CALL UVTVDAD(KM,KF_UV,ZEPSNM,ZOA1(:,IUS:IUE),ZOA1(:,IVS:IVE),& - & ZOA2(:,IVORS:IVORE),ZOA2(:,IDIVS:IDIVE)) -ENDIF - -! ------------------------------------------------------------------ - -!* 4. DIRECT LEGENDRE TRANSFORM. -! -------------------------- -IFC = 2*KF_FS -IDGLU = MIN(R%NDGNH,G%NDGLU(KM)) -IIFC = IFC -IF(KM == 0)THEN - IIFC = IFC/2 -ENDIF -CALL LEDIRAD(KM,KMLOC,IFC,IIFC,IDGLU,KLED2,ZAIA,ZSIA,ZOA1) - -! ------------------------------------------------------------------ - -!* 3. FOURIER SPACE COMPUTATIONS. -! --------------------------- - -CALL LDFOU2AD(KM,KF_UV,ZAIA,ZSIA) - -! ------------------------------------------------------------------ - -!* 2. PREPARE WORK ARRAYS. -! -------------------- - -CALL PRFI2AD(KM,KMLOC,KF_FS,ZAIA,ZSIA) - - -! ------------------------------------------------------------------ - -END SUBROUTINE LTDIRAD -END MODULE LTDIRAD_MOD - diff --git a/src/trans/gpu/internal/ltinv_ctlad_mod.F90 b/src/trans/gpu/internal/ltinv_ctlad_mod.F90 deleted file mode 100755 index a794e5aa5..000000000 --- a/src/trans/gpu/internal/ltinv_ctlad_mod.F90 +++ /dev/null @@ -1,119 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTINV_CTLAD_MOD -CONTAINS -SUBROUTINE LTINV_CTLAD(KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2,& - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - -!**** *LTINV_CTLAD* - Control routine for inverse Legandre transform - adj. - -! Purpose. -! -------- -! Control routine for the inverse LEGENDRE transform - -!** Interface. -! ---------- -! CALL INV_TRANS_CTL(...) -! KF_OUT_LT - number of fields coming out from inverse LT -! KF_UV - local number of spectral u-v fields -! KF_SCALARS - local number of scalar spectral fields -! KF_SCDERS - local number of derivatives of scalar spectral fields -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PSPSCALAR(:,:) - spectral scalarvalued fields (input) -! KFLDPTRUV(:) - field pointer array for vor./div. -! KFLDPTRSC(:) - field pointer array for PSPSCALAR -! FSPGL_PROC - external procedure to be executed in fourier space -! before transposition - -! Method. -! ------- - -! Externals. -! ---------- -! - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-06-03 - -! ------------------------------------------------------------------ -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_GEN ,ONLY : LALLOPERM -!USE TPM_DIM -USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN -USE TPM_DISTR ,ONLY : D -USE LTINVAD_MOD ,ONLY : LTINVAD -!USE TRLTOM_MOD ,ONLY : TRLTOM - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC3B(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(OUT) :: PSPSC2(:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) -EXTERNAL FSPGL_PROC -OPTIONAL FSPGL_PROC - -INTEGER(KIND=JPIM) :: JM,IM,IBLEN,ILEI2,IDIM1 - -! ------------------------------------------------------------------ - -ILEI2 = 8*KF_UV + 2*KF_SCALARS + 2*KF_SCDERS -IDIM1 = 2*KF_OUT_LT -IBLEN = D%NLENGT0B*2*KF_OUT_LT -IF (ALLOCATED(FOUBUF_IN)) THEN - IF (MAX(1,IBLEN) > SIZE(FOUBUF_IN)) THEN - DEALLOCATE(FOUBUF_IN) - ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) - ENDIF -ELSE - ALLOCATE(FOUBUF_IN(MAX(1,IBLEN))) -ENDIF -CALL GSTATS(180,0) -!CALL TRLTOM(FOUBUF,FOUBUF_IN,2*KF_OUT_LT) -CALL GSTATS(180,1) -IF (.NOT.LALLOPERM) DEALLOCATE(FOUBUF) - -CALL GSTATS(104,0) -CALL GSTATS(1648,0) -IF(KF_OUT_LT > 0) THEN -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JM,IM) - DO JM=1,D%NUMP - IM = D%MYMS(JM) - CALL LTINVAD(IM,JM,KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,ILEI2,IDIM1,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - ENDDO -!$OMP END PARALLEL DO -ENDIF -CALL GSTATS(1648,1) - -IF (.NOT.LALLOPERM) DEALLOCATE(FOUBUF_IN) -CALL GSTATS(104,1) - -! ------------------------------------------------------------------ - -END SUBROUTINE LTINV_CTLAD -END MODULE LTINV_CTLAD_MOD diff --git a/src/trans/gpu/internal/ltinvad_mod.F90 b/src/trans/gpu/internal/ltinvad_mod.F90 deleted file mode 100755 index 2de610e71..000000000 --- a/src/trans/gpu/internal/ltinvad_mod.F90 +++ /dev/null @@ -1,239 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE LTINVAD_MOD -CONTAINS -SUBROUTINE LTINVAD(KM,KMLOC,KF_OUT_LT,KF_UV,KF_SCALARS,KF_SCDERS,KLEI2,KDIM1,& - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC,FSPGL_PROC) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B -USE TPM_GEOMETRY - -!USE PRLE1AD_MOD -USE PREPSNM_MOD ,ONLY : PREPSNM -USE PRFI1BAD_MOD ,ONLY : PRFI1BAD -USE VDTUVAD_MOD ,ONLY : VDTUVAD -USE SPNSDEAD_MOD ,ONLY : SPNSDEAD -USE LEINVAD_MOD ,ONLY : LEINVAD -USE ASRE1BAD_MOD ,ONLY : ASRE1BAD -!USE FSPGL_INT_MOD - - -!**** *LTINVAD* - Inverse Legendre transform - -! Purpose. -! -------- -! Tranform from Laplace space to Fourier space, compute U and V -! and north/south derivatives of state variables. - -!** Interface. -! ---------- -! *CALL* *LTINVAD(...) - -! Explicit arguments : -! -------------------- -! KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PSPVOR - spectral vorticity -! PSPDIV - spectral divergence -! PSPSCALAR - spectral scalar variables - -! Implicit arguments : The Laplace arrays of the model. -! -------------------- The values of the Legendre polynomials -! The grid point arrays of the model -! Method. -! ------- - -! Externals. -! ---------- -! PRLE1AD - prepares the Legendre polonymials -! PREPSNM - prepare REPSNM for wavenumber KM -! PRFI1AD - prepares the spectral fields -! VDTUVAD - compute u and v from vorticity and divergence -! SPNSDEAD- compute north-south derivatives -! LEINVAD - Inverse Legendre transform -! ASRE1AD - recombination of symmetric/antisymmetric part - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From LTINVAD in IFS CY22R1 -! ------------------------------------------------------------------ - -IMPLICIT NONE - - -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KF_OUT_LT -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCDERS -INTEGER(KIND=JPIM), INTENT(IN) :: KLEI2 -INTEGER(KIND=JPIM), INTENT(IN) :: KDIM1 - -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC2(:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL,INTENT(INOUT) :: PSPSC3B(:,:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) -EXTERNAL FSPGL_PROC -OPTIONAL FSPGL_PROC - -REAL(KIND=JPRB) :: ZIA(R%NLEI1,KLEI2) -REAL(KIND=JPRBT) :: ZEPSNM(0:R%NTMAX+2) -REAL(KIND=JPRBT) :: ZSOA1(KDIM1,R%NLEI3),ZAOA1(KDIM1,R%NLEI3) - - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IFC, ISTA, IIFC, IDGLU -INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,ISU,IDL,IDU -INTEGER(KIND=JPIM) :: ILAST,IFIRST,IDIM1,IDIM3,J3 - -! LOCAL LOGICAL SCALARS - -! LOCAL REAL SCALARS - -! ------------------------------------------------------------------ - -!* 1. PREPARE AND ZEPSNM. -! ------------------- - -stop 'Error: code path not (yet) supported in GPU version' -!CALL PREPSNM(KM,KMLOC,ZEPSNM) - -! ------------------------------------------------------------------ -! 6. OPTIONAL COMPUTATIONS IN FOURIER SPACE - -!IF(PRESENT(FSPGL_PROC)) THEN -! CALL FSPGL_INT(KM,KMLOC,FSPGL_PROC) -!ENDIF - -! ------------------------------------------------------------------ - -!* 5. RECOMBINATION SYMMETRIC/ANTISYMMETRIC PART. -! -------------------------------------------- - -CALL ASRE1BAD(KF_OUT_LT,KM,KMLOC,ZAOA1,ZSOA1) - -! ------------------------------------------------------------------ - -!* 4. INVERSE LEGENDRE TRANSFORM. -! --------------------------- - - -ISTA = 1 -IFC = 2*KF_OUT_LT -IF(KF_UV > 0 .AND. .NOT. LVORGP) THEN - ISTA = ISTA+2*KF_UV -ENDIF -IF(KF_UV > 0 .AND. .NOT. LDIVGP) THEN - ISTA = ISTA+2*KF_UV -ENDIF - -ZIA(:,ISTA:ISTA+IFC-1) = 0.0_JPRBT - -IDGLU = MIN(R%NDGNH,G%NDGLU(KM)) -IIFC=IFC -IF(KM == 0)THEN - IIFC=IFC/2 -ENDIF -CALL LEINVAD(KM,KMLOC,IFC,IIFC,KF_OUT_LT,IDGLU,ZIA(:,ISTA:ISTA+IFC-1),ZAOA1,ZSOA1) - -! ------------------------------------------------------------------ - -!* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. -! ---------------------------------------------- - -ZIA(:,1:ISTA-1) = 0.0_JPRBT - -IFIRST = 1 -ILAST = 4*KF_UV -IF (KF_UV > 0) THEN - IVORL = 1 - IVORU = 2*KF_UV - IDIVL = 2*KF_UV+1 - IDIVU = 4*KF_UV - IUL = 4*KF_UV+1 - IUU = 6*KF_UV - IVL = 6*KF_UV+1 - IVU = 8*KF_UV - CALL VDTUVAD(KM,KF_UV,ZEPSNM,ZIA(:,IVORL:IVORU),ZIA(:,IDIVL:IDIVU),& - & ZIA(:,IUL:IUU),ZIA(:,IVL:IVU)) - CALL PRFI1BAD(KM,ZIA(:,IVORL:IVORU),PSPVOR,KF_UV,KFLDPTRUV) - CALL PRFI1BAD(KM,ZIA(:,IDIVL:IDIVU),PSPDIV,KF_UV,KFLDPTRUV) - ILAST = ILAST+4*KF_UV -ENDIF - -IF (KF_SCDERS > 0) THEN - ISL = 2*(4*KF_UV)+1 - ISU = ISL+2*KF_SCALARS-1 - IDL = 2*(4*KF_UV+KF_SCALARS)+1 - IDU = IDL+2*KF_SCDERS-1 - CALL SPNSDEAD(KM,KF_SCALARS,ZEPSNM,ZIA(:,ISL:ISU),ZIA(:,IDL:IDU)) -ENDIF - -IF(KF_SCALARS > 0)THEN - IF(PRESENT(PSPSCALAR)) THEN - IFIRST = ILAST+1 - ILAST = IFIRST - 1 + 2*KF_SCALARS - CALL PRFI1BAD(KM,ZIA(:,IFIRST:ILAST),PSPSCALAR(:,:),KF_SCALARS,KFLDPTRSC) - ELSE - IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*NF_SC2 - CALL PRFI1BAD(KM,ZIA(:,IFIRST:ILAST),PSPSC2(:,:),NF_SC2) - ENDIF - IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN - IDIM1=NF_SC3A - IDIM3=UBOUND(PSPSC3A,3) - DO J3=1,IDIM3 - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*IDIM1 - CALL PRFI1BAD(KM,ZIA(:,IFIRST:ILAST),PSPSC3A(:,:,J3),IDIM1) - ENDDO - ENDIF - IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN - IDIM1=NF_SC3B - IDIM3=UBOUND(PSPSC3B,3) - DO J3=1,IDIM3 - IFIRST = ILAST+1 - ILAST = IFIRST-1+2*IDIM1 - CALL PRFI1BAD(KM,ZIA(:,IFIRST:ILAST),PSPSC3B(:,:,J3),IDIM1) - ENDDO - ENDIF - ENDIF -ENDIF - - -! ------------------------------------------------------------------ - - -END SUBROUTINE LTINVAD -END MODULE LTINVAD_MOD - - - - diff --git a/src/trans/gpu/internal/prfi1ad_mod.F90 b/src/trans/gpu/internal/prfi1ad_mod.F90 deleted file mode 100755 index 607ef63c9..000000000 --- a/src/trans/gpu/internal/prfi1ad_mod.F90 +++ /dev/null @@ -1,112 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI1AD_MOD -CONTAINS -SUBROUTINE PRFI1AD(KM,KF_UV,KF_SCALARS,PIA,PSPVOR,PSPDIV,PSPSCALAR,& - & KFLDPTRUV,KFLDPTRSC) - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -!USE TPM_DISTR -!USE TPM_TRANS - -USE PRFI1BAD_MOD ,ONLY : PRFI1BAD - - -!**** *PRFI1AD* - Prepare spectral fields for inverse Legendre transform - -! Purpose. -! -------- -! To extract the spectral fields for a specific zonal wavenumber -! and put them in an order suitable for the inverse Legendre . -! tranforms.The ordering is from NSMAX to KM for better conditioning. -! Elements 1,2 and NLCM(KM)+1 are zeroed in preparation for computing -! u,v and derivatives in spectral space. - -!** Interface. -! ---------- -! *CALL* *PRFI1AD(KM,PIA,PSPVOR,PSPDIV,PSPSCALAR - -! Explicit arguments : KM - zonal wavenumber -! ------------------ PIA - spectral components for transform -! PSPVOR - vorticity -! PSPDIV - divergence -! PSPSCALAR - scalar variables - -! Implicit arguments : None. -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From PRFI1AD in IFS CY22R1 - -! ------------------------------------------------------------------ - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KM,KF_UV,KF_SCALARS -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) , INTENT(IN) :: PIA(:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IDIV, IFIRST, ILAST, IVOR - - -! ------------------------------------------------------------------ - -!* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. -! ------------------------------------ - -IFIRST = 1 -ILAST = 4*KF_UV - -!* 1.1 VORTICITY AND DIVERGENCE. - -IF(KF_UV > 0)THEN - IVOR = 1 - IDIV = 2*KF_UV+1 - CALL PRFI1BAD(KM,PIA(:,IVOR:IDIV-1),PSPVOR,KF_UV,KFLDPTRUV) - CALL PRFI1BAD(KM,PIA(:,IDIV:ILAST) ,PSPDIV,KF_UV,KFLDPTRUV) - ILAST = ILAST+4*KF_UV -ENDIF - -!* 1.2 SCALAR VARIABLES. - -IF(KF_SCALARS > 0)THEN - IFIRST = ILAST+1 - ILAST = IFIRST - 1 + 2*KF_SCALARS - CALL PRFI1BAD(KM,PIA(:,IFIRST:ILAST),PSPSCALAR(:,:),KF_SCALARS,KFLDPTRSC) -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE PRFI1AD -END MODULE PRFI1AD_MOD - - diff --git a/src/trans/gpu/internal/prfi1bad_mod.F90 b/src/trans/gpu/internal/prfi1bad_mod.F90 deleted file mode 100755 index 5ba45c863..000000000 --- a/src/trans/gpu/internal/prfi1bad_mod.F90 +++ /dev/null @@ -1,111 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI1BAD_MOD -CONTAINS -SUBROUTINE PRFI1BAD(KM,PIA,PSPEC,KFIELDS,KFLDPTR) - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DIM ,ONLY : R -USE TPM_DISTR ,ONLY : D - - -!**** *PRFI1* - Prepare spectral fields for inverse Legendre transform - -! Purpose. -! -------- -! To extract the spectral fields for a specific zonal wavenumber -! and put them in an order suitable for the inverse Legendre . -! tranforms.The ordering is from NSMAX to KM for better conditioning. -! Elements 1,2 and NLCM(KM)+1 are zeroed in preparation for computing -! u,v and derivatives in spectral space. - -!** Interface. -! ---------- -! *CALL* *PRFI1BAD(...)* - -! Explicit arguments : KM - zonal wavenumber -! ------------------ PIA - spectral components for transform -! PSPEC - spectral array -! KFIELDS - number of fields - - -! Implicit arguments : None. -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From PRFI1BAD in IFS CY22R1 - -! ------------------------------------------------------------------ - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KM,KFIELDS -REAL(KIND=JPRB) ,INTENT(INOUT) :: PSPEC(:,:) -REAL(KIND=JPRB) ,INTENT(IN) :: PIA(:,:) -INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: II, INM, IR, J, JFLD, ILCM, IOFF, IFLD - - -! ------------------------------------------------------------------ - -!* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. -! -------------------------------------------------- - - -ILCM = R%NSMAX+1-KM -IOFF = D%NASM0(KM) - -IF(PRESENT(KFLDPTR)) THEN - DO JFLD=1,KFIELDS - IR = 2*(JFLD-1)+1 - II = IR+1 - IFLD = KFLDPTR(JFLD) - DO J=1,ILCM - INM = IOFF+(ILCM-J)*2 - PSPEC(IFLD,INM ) = PSPEC(IFLD,INM ) + PIA(J+2,IR) - PSPEC(IFLD,INM+1) = PSPEC(IFLD,INM+1) + PIA(J+2,II) - ENDDO - ENDDO -ELSE - DO J=1,ILCM - INM = IOFF+(ILCM-J)*2 -!DIR$ IVDEP -!OCL NOVREC - DO JFLD=1,KFIELDS - IR = 2*(JFLD-1)+1 - II = IR+1 - PSPEC(JFLD,INM ) = PSPEC(JFLD,INM ) + PIA(J+2,IR) - PSPEC(JFLD,INM+1) = PSPEC(JFLD,INM+1) + PIA(J+2,II) - ENDDO - ENDDO -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE PRFI1BAD -END MODULE PRFI1BAD_MOD diff --git a/src/trans/gpu/internal/prfi2ad_mod.F90 b/src/trans/gpu/internal/prfi2ad_mod.F90 deleted file mode 100755 index ecea0aed4..000000000 --- a/src/trans/gpu/internal/prfi2ad_mod.F90 +++ /dev/null @@ -1,90 +0,0 @@ -! (C) Copyright 1987- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI2AD_MOD -CONTAINS -SUBROUTINE PRFI2AD(KM,KMLOC,KF_FS,PAIA,PSIA) - -!**** *PRFI2AD* - Prepare input work arrays for direct transform - -! Purpose. -! -------- -! To extract the Fourier fields for a specific zonal wavenumber -! and put them in an order suitable for the direct Legendre -! tranforms, i.e. split into symmetric and anti-symmetric part. - -!** Interface. -! ---------- -! *CALL* *PRFI2AD(..) - -! Explicit arguments : -! -------------------- KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PAIA - antisymmetric part of Fourier -! components for KM (output) -! PSIA - symmetric part of Fourier -! components for KM (output) - -! Implicit arguments : The Grid point arrays of the model. -! -------------------- - -! Method. -! ------- - -! Externals. PRFI2ADB - basic copying routine -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 87-11-25 -! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite -! for uv formulation -! Modified : 93-03-19 D. Giard - CDCONF='T' -! Modified : 93-??-?? ???????? - CDCONF='C'--> bug if CDCONF='T' -! Modified : 93-05-13 D. Giard - correction of the previous bug -! Modified : 93-11-18 M. Hamrud - use only one Fourier buffer -! Modified : 94-04-06 R. El Khatib - Full-POS configuration 'P' -! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div -! instead of u,v->vor,div -! MPP Group: 95-10-01 Support for Distributed Memory version -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE PRFI2BAD_MOD ,ONLY : PRFI2BAD -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) , INTENT(IN) :: KM -INTEGER(KIND=JPIM) , INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM) , INTENT(IN) :: KF_FS - -REAL(KIND=JPRBT) , INTENT(IN) :: PSIA(:,:), PAIA(:,:) - -! ------------------------------------------------------------------ - -!* 2. EXTRACT SYM./ANTISYM. FIELDS FROM TIME T+1. -! ------------------------------------------- - -CALL PRFI2BAD(KF_FS,KM,KMLOC,PAIA,PSIA) - -! ------------------------------------------------------------------ - -END SUBROUTINE PRFI2AD -END MODULE PRFI2AD_MOD diff --git a/src/trans/gpu/internal/prfi2bad_mod.F90 b/src/trans/gpu/internal/prfi2bad_mod.F90 deleted file mode 100755 index 85aca6259..000000000 --- a/src/trans/gpu/internal/prfi2bad_mod.F90 +++ /dev/null @@ -1,98 +0,0 @@ -! (C) Copyright 1990- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE PRFI2BAD_MOD -CONTAINS -SUBROUTINE PRFI2BAD(KFIELD,KM,KMLOC,PAIA,PSIA) - -!**** *PRFI2BAD* - Prepare input work arrays for direct transform - -! Purpose. -! -------- -! To extract the Fourier fields for a specific zonal wavenumber -! and put them in an order suitable for the direct Legendre -! tranforms, i.e. split into symmetric and anti-symmetric part. - -!** Interface. -! ---------- -! *CALL* *PRFI2BAD(..) - -! Explicit arguments : -! ------------------- KFIELD - number of fields -! KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PAOA - antisymmetric part of Fourier -! fields for zonal wavenumber KM -! PSOA - symmetric part of Fourier -! fields for zonal wavenumber KM - -! Implicit arguments : FOUBUF in TPM_TRANS -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 90-07-01 -! MPP Group: 95-10-01 Support for Distributed Memory version -! Modified : 04/06/99 D.Salmond : change order of AIA and SIA -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : FOUBUF -USE TPM_GEOMETRY ,ONLY : G -USE TPM_DISTR ,ONLY : D -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD,KM,KMLOC -REAL(KIND=JPRBT) , INTENT(IN) :: PSIA(:,:), PAIA(:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IGLS, ISL, ISTAN, ISTAS, JF, JGL - - -! ------------------------------------------------------------------ - -!* 1. EXTRACT SYM./ANTISYM. FIELDS FROM FOURIER ARRAY. -! ------------------------------------------------ - -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) - -DO JGL=ISL,R%NDGNH - IGLS = R%NDGL+1-JGL - ISTAN = (D%NSTAGT1B(D%NPROCL(JGL) )+D%NPNTGTB1(KMLOC,JGL ))*2*KFIELD - ISTAS = (D%NSTAGT1B(D%NPROCL(IGLS))+D%NPNTGTB1(KMLOC,IGLS))*2*KFIELD -!DIR$ IVDEP -!OCL NOVREC - DO JF=1,KFIELD*2 - FOUBUF(ISTAN+JF) = PSIA(JF,JGL)+PAIA(JF,JGL) - FOUBUF(ISTAS+JF) = PSIA(JF,JGL)-PAIA(JF,JGL) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE PRFI2BAD -END MODULE PRFI2BAD_MOD diff --git a/src/trans/gpu/internal/spnsdead_mod.F90 b/src/trans/gpu/internal/spnsdead_mod.F90 deleted file mode 100755 index 0ed2c6dc4..000000000 --- a/src/trans/gpu/internal/spnsdead_mod.F90 +++ /dev/null @@ -1,118 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE SPNSDEAD_MOD -CONTAINS -SUBROUTINE SPNSDEAD(KM,KF_SCALARS,PEPSNM,PF,PNSD) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -!USE TPM_GEN -USE TPM_DIM ,ONLY : R -USE TPM_FIELDS ,ONLY : F -!USE TPM_TRANS - -!**** *SPNSDEAD* - Compute North-South derivative in spectral space - -! Purpose. -! -------- -! In Laplace space compute the the North-south derivative - -!** Interface. -! ---------- -! CALL SPNSDEAD(...) - -! Explicit arguments : -! -------------------- -! KM -zonal wavenumber (input-c) -! PEPSNM - REPSNM for wavenumber KM (input-c) -! PF (NLEI1,2*KF_SCALARS) - input field (input) -! PNSD(NLEI1,2*KF_SCALARS) - N-S derivative (output) - -! Organisation within NLEI1: -! NLEI1 = NSMAX+4+mod(NSMAX+4+1,2) -! overdimensioning -! 1 : n=NSMAX+2 -! 2 : n=NSMAX+1 -! 3 : n=NSMAX -! . : -! . : -! NSMAX+3 : n=0 -! NSMAX+4 : n=-1 - -! Implicit arguments : YOMLAP -! -------------------- - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From SPNSDEAD in IFS CY22R1 - -! ------------------------------------------------------------------ - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(0:R%NTMAX+2) -REAL(KIND=JPRB), INTENT(INOUT) :: PF(:,:) -REAL(KIND=JPRB), INTENT(IN) :: PNSD(:,:) - -INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN,JI,ISMAX -REAL(KIND=JPRBT) :: ZEPSNM(-1:R%NSMAX+4) -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+4) - -! ------------------------------------------------------------------ - -!* 1. COMPUTE NORTH SOUTH DERIVATIVE. -! ------------------------------- - -!* 1.1 COMPUTE - -ISMAX = R%NSMAX -DO JN=KM-1,ISMAX+2 - IJ = ISMAX+3-JN - ZN(IJ) = F%RN(JN) - IF( JN >= 0 ) ZEPSNM(IJ) = PEPSNM(JN) -ENDDO - -ZN(0) = F%RN(ISMAX+3) -IF(KM == 0) THEN - ISKIP = 2 -ELSE - ISKIP = 1 -ENDIF - -!cdir loopchg -!cdir select(vector) -DO J=1,2*KF_SCALARS,ISKIP - DO JI=2,ISMAX+3-KM - PF(JI+1,J) = PF(JI+1,J)-ZN(JI+1)*ZEPSNM(JI) *PNSD(JI,J) - PF(JI-1,J) = PF(JI-1,J)+ZN(JI-2)*ZEPSNM(JI-1)*PNSD(JI,J) - ENDDO -ENDDO - -! ------------------------------------------------------------------ - -END SUBROUTINE SPNSDEAD -END MODULE SPNSDEAD_MOD diff --git a/src/trans/gpu/internal/updspad_mod.F90 b/src/trans/gpu/internal/updspad_mod.F90 deleted file mode 100755 index 16e9111c6..000000000 --- a/src/trans/gpu/internal/updspad_mod.F90 +++ /dev/null @@ -1,177 +0,0 @@ -! (C) Copyright 1988- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE UPDSPAD_MOD -CONTAINS -SUBROUTINE UPDSPAD(KM,KF_UV,KF_SCALARS,POA1,POA2, & - & PSPVOR,PSPDIV,PSPSCALAR,& - & PSPSC3A,PSPSC3B,PSPSC2 , & - & KFLDPTRUV,KFLDPTRSC) - -!**** *UPDSPAD* - Update spectral arrays after direct Legendre transform - -! Purpose. -! -------- -! To update the spectral arrays for a fixed zonal wave-number -! from values in POA1 and POA2. - -!** Interface. -! ---------- -! CALL UPDSPAD(...) - -! Explicit arguments : -! -------------------- -! KM - zonal wave-number -! POA1 - spectral fields for zonal wavenumber KM (basic var.) -! POA2 - spectral fields for zonal wavenumber KM (vor. div.) -! PSPVOR - spectral vorticity -! PSPDIV - spectral divergence -! PSPSCALAR - spectral scalar variables - -! Implicit arguments : -! -------------------- - -! Method. -! ------- - -! Externals. UPDSPADB - basic transfer routine -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 88-02-02 -! Modified : 91-07-01 Philippe Courtier/Mats Hamrud - Rewrite -! for uv formulation -! Modified : 94-08-02 R. El Khatib - interface to UPDSPADB -! M.Hamrud : 94-11-01 New conf 'G' - vor,div->vor,div -! instead of u,v->vor,div -! MPP Group: 95-10-01 Support for Distributed Memory version -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B -USE TPM_DISTR ,ONLY : D - -USE UPDSPBAD_MOD ,ONLY : UPDSPBAD -! - -IMPLICIT NONE - - -! DUMMY INTEGER SCALARS - -INTEGER(KIND=JPIM), INTENT(IN) :: KM,KF_UV,KF_SCALARS - -REAL(KIND=JPRBT) , INTENT(OUT) :: POA1(:,:) -REAL(KIND=JPRBT) , INTENT(OUT) :: POA2(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPVOR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPDIV(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSCALAR(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC2(:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3A(:,:,:) -REAL(KIND=JPRB) ,OPTIONAL, INTENT(INOUT) :: PSPSC3B(:,:,:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IVORS, IVORE, IDIVS, IDIVE, IST ,IEND, JN, ISE,IFLD,JFLD -INTEGER(KIND=JPIM) :: IDIM1,IDIM3,J3 - -! ------------------------------------------------------------------ - -!* 1. UPDATE FIELDS -! ------------- - - -!* 1.1 VORTICITY AND DIVERGENCE. - -IST = 1 -IF (KF_UV > 0) THEN - IST = IST+4*KF_UV - IVORS = 1 - IVORE = 2*KF_UV - IDIVS = 2*KF_UV+1 - IDIVE = 4*KF_UV - IF (KM == 0) THEN - IF(PRESENT(KFLDPTRUV)) THEN - DO JFLD=1,KF_UV - IFLD = KFLDPTRUV(JFLD) - PSPVOR(IFLD,D%NASM0(0)) = 0.0_JPRBT - PSPDIV(IFLD,D%NASM0(0)) = 0.0_JPRBT - ENDDO - DO JN=0,R%NSMAX - ISE = 1+JN*2+1 - DO JFLD=1,KF_UV - IFLD = KFLDPTRUV(JFLD) - PSPDIV(IFLD,ISE) = 0.0_JPRBT - PSPVOR(IFLD,ISE) = 0.0_JPRBT - ENDDO - ENDDO - ELSE - PSPVOR(:,D%NASM0(0)) = 0.0_JPRBT - PSPDIV(:,D%NASM0(0)) = 0.0_JPRBT - DO JN=0,R%NSMAX - ISE = 1+JN*2+1 - PSPDIV(:,ISE) = 0.0_JPRBT - PSPVOR(:,ISE) = 0.0_JPRBT - ENDDO - ENDIF - ENDIF - CALL UPDSPBAD(KM,KF_UV,POA2(:,IVORS:IVORE),PSPVOR,KFLDPTRUV) - CALL UPDSPBAD(KM,KF_UV,POA2(:,IDIVS:IDIVE),PSPDIV,KFLDPTRUV) -ENDIF - -!* 1.2 SCALARS - -IF (KF_SCALARS > 0) THEN - IF(PRESENT(PSPSCALAR)) THEN - IEND = IST+2*KF_SCALARS-1 - CALL UPDSPBAD(KM,KF_SCALARS,POA1(:,IST:IEND),PSPSCALAR,KFLDPTRSC) - ELSE - IF(PRESENT(PSPSC2) .AND. NF_SC2 > 0) THEN - IDIM1 = NF_SC2 - IEND = IST+2*IDIM1-1 - CALL UPDSPBAD(KM,IDIM1,POA1(:,IST:IEND),PSPSC2) - IST=IST+2*IDIM1 - ENDIF - IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN - IDIM1=NF_SC3A - IDIM3=UBOUND(PSPSC3A,3) - DO J3=1,IDIM3 - IEND = IST+2*IDIM1-1 - CALL UPDSPBAD(KM,IDIM1,POA1(:,IST:IEND),PSPSC3A(:,:,J3)) - IST=IST+2*IDIM1 - ENDDO - ENDIF - IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN - IDIM1=NF_SC3B - IDIM3=UBOUND(PSPSC3B,3) - DO J3=1,IDIM3 - IEND = IST+2*IDIM1-1 - CALL UPDSPBAD(KM,IDIM1,POA1(:,IST:IEND),PSPSC3B(:,:,J3)) - IST=IST+2*IDIM1 - ENDDO - ENDIF - ENDIF -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE UPDSPAD -END MODULE UPDSPAD_MOD diff --git a/src/trans/gpu/internal/updspbad_mod.F90 b/src/trans/gpu/internal/updspbad_mod.F90 deleted file mode 100755 index 0806c62ab..000000000 --- a/src/trans/gpu/internal/updspbad_mod.F90 +++ /dev/null @@ -1,159 +0,0 @@ -! (C) Copyright 1988- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE UPDSPBAD_MOD -CONTAINS -SUBROUTINE UPDSPBAD(KM,KFIELD,POA,PSPEC,KFLDPTR) - - -!**** *UPDSPBAD* - Update spectral arrays after direct Legendre transform - -! Purpose. -! -------- -! To update spectral arrays for a fixed zonal wave-number -! from values in POA. - -!** Interface. -! ---------- -! CALL UPDSPBAD(....) - -! Explicit arguments : KM - zonal wavenumber -! -------------------- KFIELD - number of fields -! POA - work array -! PSPEC - spectral array - -! Implicit arguments : None -! -------------------- - -! Method. -! ------- - -! Externals. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 88-02-02 -! D. Giard : 93-03-19 truncations NSMAX and NTMAX (see NOTE) -! R. El Khatib : 94-08-02 Replace number of fields by indexes of the -! first and last field -! L. Isaksen : 95-06-06 Reordering of spectral arrays -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_DIM ,ONLY : R -!USE TPM_FIELDS -USE TPM_DISTR ,ONLY : D -! - -IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(IN) :: KM,KFIELD -REAL(KIND=JPRBT) ,INTENT(OUT) :: POA(:,:) -REAL(KIND=JPRB) ,INTENT(INOUT) :: PSPEC(:,:) -INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: II, INM, IR, JFLD, JN, ISMAX, ITMAX, IASM0,IFLD - - -! ------------------------------------------------------------------ - -!* 0. NOTE. -! ----- - -! The following transfer reads : -! SPEC(k,NASM0(m)+NLTN(n)*2) =POA(nn,2*k-1) (real part) -! SPEC(k,NASM0(m)+NLTN(n)*2+1)=POA(nn,2*k ) (imaginary part) -! with n from m to NSMAX -! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. -! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) -! nn is the loop index. - - - -!* 1. UPDATE SPECTRAL FIELDS. -! ----------------------- -ISMAX = R%NSMAX -ITMAX = R%NTMAX -IASM0 = D%NASM0(KM) - - -POA(:,:) = 0.0_JPRBT - -!* 1.1 KM=0 - -IF(KM == 0) THEN - IF(PRESENT(KFLDPTR)) THEN - DO JFLD=1,KFIELD - IR = 2*JFLD-1 - IFLD = KFLDPTR(JFLD) - DO JN=ITMAX+2-ISMAX,ITMAX+2-KM - INM = IASM0+(ITMAX+2-JN)*2 - POA(JN,IR) = PSPEC(IFLD,INM) - PSPEC(IFLD,INM) = 0.0_JPRBT - ENDDO - ENDDO - ELSE - DO JN=ITMAX+2-ISMAX,ITMAX+2-KM - INM = IASM0+(ITMAX+2-JN)*2 -!DIR$ IVDEP -!OCL NOVREC - DO JFLD=1,KFIELD - IR = 2*JFLD-1 - POA(JN,IR) = PSPEC(JFLD,INM) - PSPEC(JFLD,INM) = 0.0_JPRBT - ENDDO - ENDDO - ENDIF -!* 1.2 KM!=0 - -ELSE - IF(PRESENT(KFLDPTR)) THEN - DO JFLD=1,KFIELD - IR = 2*JFLD-1 - II = IR+1 - IFLD = KFLDPTR(JFLD) - DO JN=ITMAX+2-ISMAX,ITMAX+2-KM - INM = IASM0+((ITMAX+2-JN)-KM)*2 - POA(JN,IR) = PSPEC(IFLD,INM) - POA(JN,II) = PSPEC(IFLD,INM+1) - PSPEC(IFLD,INM) = 0.0_JPRBT - PSPEC(IFLD,INM+1) = 0.0_JPRBT - ENDDO - ENDDO - ELSE - DO JN=ITMAX+2-ISMAX,ITMAX+2-KM - INM = IASM0+((ITMAX+2-JN)-KM)*2 -!DIR$ IVDEP -!OCL NOVREC - DO JFLD=1,KFIELD - IR = 2*JFLD-1 - II = IR+1 - POA(JN,IR) = PSPEC(JFLD,INM) - POA(JN,II) = PSPEC(JFLD,INM+1) - PSPEC(JFLD,INM) = 0.0_JPRBT - PSPEC(JFLD,INM+1) = 0.0_JPRBT - ENDDO - ENDDO - ENDIF -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE UPDSPBAD -END MODULE UPDSPBAD_MOD diff --git a/src/trans/gpu/internal/uvtvdad_mod.F90 b/src/trans/gpu/internal/uvtvdad_mod.F90 deleted file mode 100755 index c01aaa34d..000000000 --- a/src/trans/gpu/internal/uvtvdad_mod.F90 +++ /dev/null @@ -1,138 +0,0 @@ -! (C) Copyright 1991- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE UVTVDAD_MOD -CONTAINS -SUBROUTINE UVTVDAD(KM,KFIELD,PEPSNM,PU,PV,PVOR,PDIV) - -!**** *UVTVDAD* - Compute vor/div from u and v in spectral space - -! Purpose. -! -------- -! To compute vorticity and divergence from u and v in spectral -! space. Input u and v from KM to NTMAX+1, output vorticity and -! divergence from KM to NTMAX. - -!** Interface. -! ---------- -! CALL UVTVDAD(KM,KFIELD,PEPSNM,PU,PV,PVOR,PDIV) - -! Explicit arguments : KM - zonal wave-number -! -------------------- KFIELD - number of fields (levels) -! PEPSNM - REPSNM for wavenumber KM -! PU - u wind component for zonal -! wavenumber KM -! PV - v wind component for zonal -! wavenumber KM -! PVOR - vorticity for zonal -! wavenumber KM -! PDIV - divergence for zonal -! wavenumber KM - - -! Method. See ref. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 91-07-01 -! D. Giard : NTMAX instead of NSMAX -! ------------------------------------------------------------------ - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_FIELDS ,ONLY : F -!USE TPM_DISTR -! - -IMPLICIT NONE - -! DUMMY INTEGER SCALARS -INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD -INTEGER(KIND=JPIM), INTENT(IN) :: KM - -REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(0:R%NTMAX+2) -REAL(KIND=JPRBT), INTENT(IN) :: PVOR(:,:),PDIV(:,:) -REAL(KIND=JPRBT), INTENT(INOUT) :: PU (:,:),PV (:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: II, IN, IR, J, JN, ITMAX - -! LOCAL REAL SCALARS -REAL(KIND=JPRBT) :: ZKM -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+3) - - -! ------------------------------------------------------------------ - - -!* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. -! ------------------------------------------ - -ZKM = KM -ITMAX = R%NTMAX -ZN(KM-1:ITMAX+3) = F%RN(KM-1:ITMAX+3) - -!* 1.2 COMPUTE VORTICITY AND DIVERGENCE. - -IF(KM /= 0) THEN - DO JN=KM,ITMAX - IN = ITMAX+2-JN -!DIR$ IVDEP -!OCL NOVREC - DO J=1,KFIELD - IR = 2*J-1 - II = IR+1 - - PV(IN,II) = PV(IN,II)-ZKM*PVOR(IN,IR) - PU(IN-1,IR) = PU(IN-1,IR)-ZN(JN)*PEPSNM(JN+1)*PVOR(IN,IR) - PU(IN+1,IR) = PU(IN+1,IR)+ZN(JN+1)*PEPSNM(JN)*PVOR(IN,IR) - - PV(IN,IR) = PV(IN,IR)+ZKM*PVOR(IN,II) - PU(IN-1,II) = PU(IN-1,II)-ZN(JN)*PEPSNM(JN+1)*PVOR(IN,II) - PU(IN+1,II) = PU(IN+1,II)+ZN(JN+1)*PEPSNM(JN)*PVOR(IN,II) - - PU(IN,II) = PU(IN,II)-ZKM*PDIV(IN,IR) - PV(IN-1,IR) = PV(IN-1,IR)+ZN(JN)*PEPSNM(JN+1)*PDIV(IN,IR) - PV(IN+1,IR) = PV(IN+1,IR)-ZN(JN+1)*PEPSNM(JN)*PDIV(IN,IR) - - PU(IN,IR) = PU(IN,IR)+ZKM*PDIV(IN,II) - PV(IN-1,II) = PV(IN-1,II)+ZN(JN)*PEPSNM(JN+1)*PDIV(IN,II) - PV(IN+1,II) = PV(IN+1,II)-ZN(JN+1)*PEPSNM(JN)*PDIV(IN,II) - ENDDO - ENDDO -ELSE - DO JN=KM,ITMAX - IN = ITMAX+2-JN - DO J=1,KFIELD - IR = 2*J-1 - PU(IN-1,IR) = PU(IN-1,IR)-ZN(JN )*PEPSNM(JN+1)*PVOR(IN,IR) - PU(IN+1,IR) = PU(IN+1,IR)+ZN(JN+1)*PEPSNM(JN )*PVOR(IN,IR) - PV(IN-1,IR) = PV(IN-1,IR)+ZN(JN )*PEPSNM(JN+1)*PDIV(IN,IR) - PV(IN+1,IR) = PV(IN+1,IR)-ZN(JN+1)*PEPSNM(JN )*PDIV(IN,IR) - ENDDO - ENDDO -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE UVTVDAD -END MODULE UVTVDAD_MOD diff --git a/src/trans/gpu/internal/vd2uv_ctl_mod.F90 b/src/trans/gpu/internal/vd2uv_ctl_mod.F90 deleted file mode 100755 index 08ec2a971..000000000 --- a/src/trans/gpu/internal/vd2uv_ctl_mod.F90 +++ /dev/null @@ -1,80 +0,0 @@ -! (C) Copyright 2015- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE VD2UV_CTL_MOD -CONTAINS -SUBROUTINE VD2UV_CTL(KF_UV,PSPVOR,PSPDIV,PU,PV) - -!**** *VD2UV_CTL* - Control routine for going from vor/div to spectral U and V. - -! Purpose. -! -------- -! Control routine for computing spectral U (u*cos(theta)) and V - -!** Interface. -! ---------- -! CALL INV_TRANS_CTL(...) -! KF_UV - local number of spectral u-v fields -! PSPVOR(:,:) - spectral vorticity (input) -! PSPDIV(:,:) - spectral divergence (input) -! PU(:,:) - U (out) -! PV(:,:) - V (out) - -! Method. -! ------- - -! Externals. -! ---------- - - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : July 2015 - -! ------------------------------------------------------------------ - -USE PARKIND1 ,ONLY : JPIM ,JPRB - -USE TPM_DISTR ,ONLY : D - -USE VD2UV_MOD ,ONLY : VD2UV - -IMPLICIT NONE - -INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV -REAL(KIND=JPRB),INTENT(IN) :: PSPVOR(:,:) -REAL(KIND=JPRB),INTENT(IN) :: PSPDIV(:,:) -REAL(KIND=JPRB),INTENT(OUT) :: PU(:,:) -REAL(KIND=JPRB),INTENT(OUT) :: PV(:,:) - -INTEGER(KIND=JPIM) :: JM,IM,ILEI2 - -! ------------------------------------------------------------------ - -CALL GSTATS(102,0) -ILEI2 = 8*KF_UV - -CALL GSTATS(1647,0) -!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JM,IM) -DO JM=1,D%NUMP - IM = D%MYMS(JM) - CALL VD2UV(IM,JM,KF_UV,ILEI2,PSPVOR,PSPDIV,PU,PV) -ENDDO -!$OMP END PARALLEL DO -CALL GSTATS(1647,1) -CALL GSTATS(102,1) - -! ------------------------------------------------------------------ - -END SUBROUTINE VD2UV_CTL -END MODULE VD2UV_CTL_MOD diff --git a/src/trans/gpu/internal/vd2uv_mod.F90 b/src/trans/gpu/internal/vd2uv_mod.F90 deleted file mode 100755 index b97c49f22..000000000 --- a/src/trans/gpu/internal/vd2uv_mod.F90 +++ /dev/null @@ -1,156 +0,0 @@ -! (C) Copyright 2015- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE VD2UV_MOD -CONTAINS -SUBROUTINE VD2UV(KM,KMLOC,KF_UV,KLEI2,PSPVOR,PSPDIV,PU,PV) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT -USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - -USE TPM_CONSTANTS -USE TPM_DIM ,ONLY : R -USE TPM_DISTR ,ONLY : D - -USE PREPSNM_MOD ,ONLY : PREPSNM -USE PRFI1B_MOD ,ONLY : PRFI1B -USE VDTUV_MOD ,ONLY : VDTUV - - -!**** *VD2UV* - U and V from Vor/div -! -! Purpose. -! -------- -! -!** Interface. -! ---------- -! *CALL* *VD2UV(...) - -! Explicit arguments : -! -------------------- -! KM - zonal wavenumber -! KMLOC - local zonal wavenumber -! PSPVOR - spectral vorticity -! PSPDIV - spectral divergence -! PU(:,:) - spectral U (out) -! PV(:,:) - spectral V (out) - - -! Implicit arguments : - -! Method. -! ------- - -! Externals. -! ---------- - -! PREPSNM - prepare REPSNM for wavenumber KM -! PRFI1B - prepares the spectral fields -! VDTUV - compute u and v from vorticity and divergence - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud *ECMWF* - -! Modifications. -! -------------- -! Original : July 2015 -! -! ------------------------------------------------------------------ - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KM -INTEGER(KIND=JPIM), INTENT(IN) :: KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV -INTEGER(KIND=JPIM), INTENT(IN) :: KLEI2 - -REAL(KIND=JPRB) , INTENT(IN) :: PSPVOR(:,:) -REAL(KIND=JPRB) , INTENT(IN) :: PSPDIV(:,:) -REAL(KIND=JPRB) , INTENT(OUT) :: PU(:,:) -REAL(KIND=JPRB) , INTENT(OUT) :: PV(:,:) - -REAL(KIND=JPRB) :: ZIA(R%NLEI1,KLEI2) -REAL(KIND=JPRBT) :: ZEPSNM(0:R%NTMAX+2),ZA_R - -INTEGER(KIND=JPIM) :: IFC, ISTA, IIFC, IDGLU, JGL, JFLD,ILCM -INTEGER(KIND=JPIM) :: IVORL,IVORU,IDIVL,IDIVU,IUL,IUU,IVL,IVU,ISL,II,IR,INM,J -INTEGER(KIND=JPIM) :: IFIRST, ILAST, IOFF - -REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - -! ------------------------------------------------------------------ - -!* 1. PERFORM LEGENDRE TRANFORM. -! -------------------------- - -IF (LHOOK) CALL DR_HOOK('VD2UV_MOD',0,ZHOOK_HANDLE) - -! ------------------------------------------------------------------ - -!* 1. PREPARE ZEPSNM. -! --------------- - -stop 'Error: code path not (yet) supported in GPU version' -!CALL PREPSNM(KM,KMLOC,ZEPSNM) - -! ------------------------------------------------------------------ - - -!* 3. SPECTRAL COMPUTATIONS FOR U,V AND DERIVATIVES. -! ---------------------------------------------- - -IFIRST = 1 -ILAST = 4*KF_UV - -IF (KF_UV > 0) THEN - IVORL = 1 - IVORU = 2*KF_UV - IDIVL = 2*KF_UV+1 - IDIVU = 4*KF_UV - IUL = 4*KF_UV+1 - IUU = 6*KF_UV - IVL = 6*KF_UV+1 - IVU = 8*KF_UV - stop 'Error: code path not (yet) supported in GPU version' - !CALL PRFI1B(KM,ZIA(:,IVORL:IVORU),PSPVOR,KF_UV) - !CALL PRFI1B(KM,ZIA(:,IDIVL:IDIVU),PSPDIV,KF_UV) - - !CALL VDTUV(KM,KF_UV,ZEPSNM,ZIA(:,IVORL:IVORU),ZIA(:,IDIVL:IDIVU),& - ! & ZIA(:,IUL:IUU),ZIA(:,IVL:IVU)) - ILCM = R%NSMAX+1-KM - IOFF = D%NASM0(KM) - ZA_R = 1.0_JPRBT/RA - DO J=1,ILCM - INM = IOFF+(ILCM-J)*2 - DO JFLD=1,KF_UV - IR = 2*(JFLD-1)+1 - II = IR+1 - PU(JFLD,INM ) = ZIA(J+2,IR+IUL-1)*ZA_R - PU(JFLD,INM+1) = ZIA(J+2,II+IUL-1)*ZA_R - PV(JFLD,INM ) = ZIA(J+2,IR+IVL-1)*ZA_R - PV(JFLD,INM+1) = ZIA(J+2,II+IVL-1)*ZA_R - ENDDO - ENDDO -ENDIF - -IF (LHOOK) CALL DR_HOOK('VD2UV_MOD',1,ZHOOK_HANDLE) -! ------------------------------------------------------------------ - -END SUBROUTINE VD2UV -END MODULE VD2UV_MOD - - - - diff --git a/src/trans/gpu/internal/vdtuvad_mod.F90 b/src/trans/gpu/internal/vdtuvad_mod.F90 deleted file mode 100755 index 8a6ec4b42..000000000 --- a/src/trans/gpu/internal/vdtuvad_mod.F90 +++ /dev/null @@ -1,144 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE VDTUVAD_MOD -CONTAINS -SUBROUTINE VDTUVAD(KM,KFIELD,PEPSNM,PVOR,PDIV,PU,PV) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_FIELDS ,ONLY : F - - -!**** *VDTUVAD* - Compute U,V in spectral space - -! Purpose. -! -------- -! In Laplace space compute the the winds -! from vorticity and divergence. - -!** Interface. -! ---------- -! CALL VDTUVAD(...) - -! Explicit arguments : KM -zonal wavenumber (input-c) -! -------------------- KFIELD - number of fields (input-c) -! PEPSNM - REPSNM for wavenumber KM (input-c) -! PVOR(NLEI1,2*KFIELD) - vorticity (input) -! PDIV(NLEI1,2*KFIELD) - divergence (input) -! PU(NLEI1,2*KFIELD) - u wind (output) -! PV(NLEI1,2*KFIELD) - v wind (output) -! Organisation within NLEI1: -! NLEI1 = NSMAX+4+mod(NSMAX+4+1,2) -! overdimensioning -! 1 : n=NSMAX+2 -! 2 : n=NSMAX+1 -! 3 : n=NSMAX -! . : -! . : -! NSMAX+3 : n=0 -! NSMAX+4 : n=-1 - -! Implicit arguments : Eigenvalues of inverse Laplace operator -! -------------------- from YOMLAP - -! Method. -! ------- - -! Externals. None. -! ---------- - -! Reference. -! ---------- -! ECMWF Research Department documentation of the IFS -! Temperton, 1991, MWR 119 p1303 - -! Author. -! ------- -! Mats Hamrud and Philippe Courtier *ECMWF* - -! Modifications. -! -------------- -! Original : 00-02-01 From VDTUVAD in IFS CY22R1 - -! ------------------------------------------------------------------ - -IMPLICIT NONE - -INTEGER(KIND=JPIM), INTENT(IN) :: KM,KFIELD -REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(0:R%NTMAX+2) -REAL(KIND=JPRB), INTENT(INOUT) :: PVOR(:,:),PDIV(:,:) -REAL(KIND=JPRB), INTENT(IN) :: PU (:,:),PV (:,:) - -! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: II, IJ, IR, J, JN, ISMAX,JI - -! LOCAL REAL SCALARS -REAL(KIND=JPRBT) :: ZKM -REAL(KIND=JPRBT) :: ZN(-1:R%NTMAX+4) -REAL(KIND=JPRBT) :: ZLAPIN(-1:R%NSMAX+4) -REAL(KIND=JPRBT) :: ZEPSNM(-1:R%NSMAX+4) - - - -! ------------------------------------------------------------------ - -!* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. -! ------------------------------------------ - -ZKM = KM -ISMAX = R%NSMAX -DO JN=KM-1,ISMAX+2 - IJ = ISMAX+3-JN - ZN(IJ) = F%RN(JN) - ZLAPIN(IJ) = F%RLAPIN(JN) - IF( JN >= 0 ) ZEPSNM(IJ) = PEPSNM(JN) -ENDDO -ZN(0) = F%RN(ISMAX+3) - -!* 1.1 U AND V (KM=0) . - -IF(KM == 0) THEN - DO J=1,KFIELD - IR = 2*J-1 - DO JI=2,ISMAX+3-KM - PDIV(JI-1,IR) = PDIV(JI-1,IR)+ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PV(JI,IR) - PVOR(JI-1,IR) = PVOR(JI-1,IR)-ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PU(JI,IR) - PDIV(JI+1,IR) = PDIV(JI+1,IR)-ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PV(JI,IR) - PVOR(JI+1,IR) = PVOR(JI+1,IR)+ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PU(JI,IR) - ENDDO - ENDDO -!* 1.2 U AND V (KM!=0) . - -ELSE - DO J=1,KFIELD - IR = 2*J-1 - II = IR+1 - DO JI=2,ISMAX+3-KM - PDIV(JI-1,II) = PDIV(JI-1,II)+ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PV(JI,II) - PDIV(JI-1,IR) = PDIV(JI-1,IR)+ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PV(JI,IR) - PVOR(JI-1,II) = PVOR(JI-1,II)-ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PU(JI,II) - PVOR(JI-1,IR) = PVOR(JI-1,IR)-ZN(JI-2)*ZEPSNM(JI-1)*ZLAPIN(JI-1)*PU(JI,IR) - PDIV(JI+1,II) = PDIV(JI+1,II)-ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PV(JI,II) - PDIV(JI+1,IR) = PDIV(JI+1,IR)-ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PV(JI,IR) - PVOR(JI+1,II) = PVOR(JI+1,II)+ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PU(JI,II) - PVOR(JI+1,IR) = PVOR(JI+1,IR)+ZN(JI+1)*ZEPSNM(JI)*ZLAPIN(JI+1)*PU(JI,IR) - PVOR(JI,IR) = PVOR(JI,IR)+ZKM*ZLAPIN(JI)*PV(JI,II) - PVOR(JI,II) = PVOR(JI,II)-ZKM*ZLAPIN(JI)*PV(JI,IR) - PDIV(JI,IR) = PDIV(JI,IR)+ZKM*ZLAPIN(JI)*PU(JI,II) - PDIV(JI,II) = PDIV(JI,II)-ZKM*ZLAPIN(JI)*PU(JI,IR) - ENDDO - ENDDO -ENDIF - -! ------------------------------------------------------------------ - -END SUBROUTINE VDTUVAD -END MODULE VDTUVAD_MOD From 0a1177ec8fb70e339fae1295261e24e7fdb89a73 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:39 -0700 Subject: [PATCH 240/263] Minor cleanup with module use --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 25 +++++++------------- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 6 ++--- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 46fec9b4c..49a04ada5 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -72,30 +72,21 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ! ------------------------------------------------------------------ - USE PARKIND1 ,ONLY : JPIM ,JPRB - USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPRBT, JPRD, JPRB, JPIM USE TPM_GEN ,ONLY : NPROMATR, NOUT - USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_DISTR, ONLY: NPROC - USE FTDIR_MOD ,ONLY : FTDIR, FTDIR_HANDLE, PREPARE_FTDIR + USE MPL_MODULE ,ONLY : MPL_BARRIER + USE TPM_TRANS, ONLY: REUSE_PTR + USE TPM_GEN + USE ALLOCATOR_MOD - USE SHUFFLE_MOD ,ONLY : SHUFFLE - USE FIELD_SPLIT_MOD ,ONLY : FIELD_SPLIT - USE LTDIR_MOD ,ONLY : LTDIR, PREPARE_LTDIR, LTDIR_HANDLE + USE FTDIR_MOD + USE LTDIR_MOD USE TRGTOL_MOD - USE MPL_MODULE ,ONLY : MPL_BARRIER - USE TPM_GEN ,ONLY : LSYNC_TRANS - USE TRLTOM_MOD ,ONLY : TRLTOM_CUDAAWARE, TRLTOM_HANDLE, PREPARE_TRLTOM + USE TRLTOM_MOD USE TRLTOM_PACK_UNPACK - USE TPM_DISTR, ONLY : D, NPROC - USE TPM_TRANS, ONLY:REUSE_PTR - - USE ALLOCATOR_MOD - USE ISO_C_BINDING, ONLY: C_INT8_T - ! - IMPLICIT NONE ! Declaration of arguments diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index e46b24e36..478c7da78 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -86,9 +86,10 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPRD - USE TPM_GEN ,ONLY : NPROMATR, NOUT, NERR + USE TPM_GEN ,ONLY : NPROMATR, NOUT USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, REUSE_PTR USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS + USE ALLOCATOR_MOD USE TRMTOL_MOD USE LTINV_MOD @@ -96,9 +97,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE FSC_MOD USE FTINV_MOD USE TRLTOG_MOD - USE TPM_DISTR ,ONLY : D - USE ALLOCATOR_MOD - ! IMPLICIT NONE From bd09839ba4036a228ad373e04e84f8bd016fd22b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:39 -0700 Subject: [PATCH 241/263] Remove FSPGL_INT_MOD --- src/trans/gpu/internal/fspgl_int_mod.F90 | 124 ----------------------- 1 file changed, 124 deletions(-) delete mode 100755 src/trans/gpu/internal/fspgl_int_mod.F90 diff --git a/src/trans/gpu/internal/fspgl_int_mod.F90 b/src/trans/gpu/internal/fspgl_int_mod.F90 deleted file mode 100755 index 75e1a1acb..000000000 --- a/src/trans/gpu/internal/fspgl_int_mod.F90 +++ /dev/null @@ -1,124 +0,0 @@ -! (C) Copyright 2000- ECMWF. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE FSPGL_INT_MOD -CONTAINS -SUBROUTINE FSPGL_INT(KF_UV,KF_SCALARS,KF_SCDERS,KF_OUT_LT,& - & FSPGL_PROC,KFLDPTRUV,KFLDPTRSC) - -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT - -USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : FOUBUF_IN, LDIVGP, LVORGP -USE TPM_GEOMETRY ,ONLY : G -USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_NASM0 -USE TPM_FIELDS ,ONLY : F -USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS -! - -IMPLICIT NONE - -INTEGER(KIND=JPIM) :: KM, KMLOC -INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS,KF_SCDERS,KF_OUT_LT -EXTERNAL FSPGL_PROC -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRUV(:) -INTEGER(KIND=JPIM),OPTIONAL,INTENT(IN) :: KFLDPTRSC(:) - -! -! ZFIELD 2nd dimension is extended from 0 to R%NDGL+1, while only 1 to R%NDGL -! is given from the north/south transforms, and only 1 to R%NDGL rows will be -! passed to the east/west transforms. -! the 2 extra rows are used inside the model Fourier space computations -! (outside the transform package - see FSPGLH in Arpege/IFS). -! -REAL(KIND=JPRBT) :: ZFIELD(2*KF_OUT_LT,0:R%NDGL+1) - - -INTEGER(KIND=JPIM) :: ISL, IGLS, JFLD, JGL ,IPROC, IPROCS -INTEGER(KIND=JPIM) :: IPTRU,IST,J -INTEGER(KIND=JPIM) :: IDGNH,IDGL -INTEGER(KIND=JPIM) :: ISTAN(R%NDGNH),ISTAS(R%NDGNH) -INTEGER(KIND=JPIM) :: IFLDPTRUV(KF_UV),IFLDPTRSC(KF_SCALARS) -! ------------------------------------------------------------------ -!$acc data if(present(KFLDPTRUV)) COPYIN(KFLDPTRUV,KFLDPTRSC) -!$acc data create(IFLDPTRUV,IFLDPTRSC,ISTAN,ISTAS,ZFIELD) & -!$acc& present(d_myms,D%NSTAGT0B,D%NPNTGTB1,D%NPROCL,FOUBUF_IN) -IF(PRESENT(KFLDPTRUV)) THEN - IFLDPTRUV(:) = KFLDPTRUV(1:KF_UV) - IFLDPTRSC(:) = KFLDPTRSC(1:KF_SCALARS) -ELSE - DO J=1,KF_UV - IFLDPTRUV(J) = J - ENDDO - DO J=1,KF_SCALARS - IFLDPTRSC(J) = J - ENDDO -ENDIF - -!loop over wavenumber -DO KMLOC=1,D_NUMP - KM = D_MYMS(KMLOC) - -ISL = MAX(R%NDGNH-G%NDGLU(KM)+1,1) -IDGNH = R%NDGNH -IDGL = R%NDGL -!$acc parallel loop -DO JGL=ISL,IDGNH - IPROC = D%NPROCL(JGL) - ISTAN(JGL) = (D%NSTAGT1B(IPROC) + D%NPNTGTB1(KMLOC,JGL))*2*KF_OUT_LT - IGLS = IDGL+1-JGL - IPROCS = D%NPROCL(IGLS) - ISTAS(JGL) = (D%NSTAGT1B(IPROCS) + D%NPNTGTB1(KMLOC,IGLS))*2*KF_OUT_LT -ENDDO - -!$acc parallel loop collapse(2) -DO JGL=ISL,IDGNH - DO JFLD=1,2*KF_OUT_LT - IGLS = IDGL+1-JGL - ZFIELD(JFLD,JGL) = FOUBUF_IN(ISTAN(JGL)+JFLD) - ZFIELD(JFLD,IGLS) = FOUBUF_IN(ISTAS(JGL)+JFLD) - ENDDO -ENDDO - -IST = 1 -IF(LVORGP) THEN - IST = IST+2*KF_UV -ENDIF -IF(LDIVGP) THEN - IST = IST+2*KF_UV -ENDIF -IPTRU = IST - - - - -CALL FSPGL_PROC(KM,ISL,IDGL,KF_OUT_LT,F%R1MU2,ZFIELD,& - & IPTRU,KF_UV,KF_SCALARS,& - & IFLDPTRUV) - - !$acc parallel loop collapse(2) -DO JGL=ISL,IDGNH - DO JFLD=1,2*KF_OUT_LT - IGLS = IDGL+1-JGL - !OCL NOVREC - FOUBUF_IN(ISTAN(JGL)+JFLD) = ZFIELD(JFLD,JGL) - FOUBUF_IN(ISTAS(JGL)+JFLD) = ZFIELD(JFLD,IGLS) - ENDDO -ENDDO - -!end loop over wavenumber -END DO - -!$acc end data -!$acc end data -! ------------------------------------------------------------------ - -END SUBROUTINE FSPGL_INT -END MODULE FSPGL_INT_MOD From 7f12a1c36b921101fcfc3b0c312f7ccf79998ccb Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:40 -0700 Subject: [PATCH 242/263] simplify control logic in in main driver routines --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 45 +++++++++----------- src/trans/gpu/internal/inv_trans_ctl_mod.F90 | 9 ++-- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 49a04ada5..3611f28aa 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -164,37 +164,32 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) - ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) - CALL GSTATS(1640,0) IF (KF_FS > 0) THEN - CALL FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) - ELSE - PREEL_COMPLEX => PREEL_REAL - ENDIF - CALL GSTATS(1640,1) - CALL GSTATS(153,0) - IF (NPROC > 1) THEN - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - - IF (KF_FS > 0) THEN - CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) - CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) - CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - ENDIF - - ELSE - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' - ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space - CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - ENDIF - CALL GSTATS(153,1) + ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) + CALL GSTATS(1640,0) + CALL FTDIR(HFTDIR,PREEL_REAL,PREEL_COMPLEX,KF_FS) + CALL GSTATS(1640,1) + + CALL GSTATS(153,0) + + IF (NPROC > 1) THEN + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' + CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) + CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + ELSE + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' + ! Short cut - no need to go through tansforms, we will go directly into + ! the legendre space + CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + ENDIF + CALL GSTATS(153,1) - IF (KF_FS > 0) THEN CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & & PSPVOR,PSPDIV,PSPSCALAR,& & PSPSC3A,PSPSC3B,PSPSC2) + ENDIF END SUBROUTINE DIR_TRANS_CTL diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 478c7da78..a6ad97e1c 100755 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -204,17 +204,15 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) - ! Legendre transformations - CALL GSTATS(102,0) IF (KF_FS > 0) THEN + ! Legendre transformations + CALL GSTATS(102,0) CALL LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) - END IF - CALL GSTATS(102,1) + CALL GSTATS(102,1) ! Packing into send buffer, to fourier space and unpack - IF (KF_FS > 0) THEN CALL GSTATS(152,0) CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) @@ -237,5 +235,6 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) CALL GSTATS(157,1) + END SUBROUTINE INV_TRANS_CTL END MODULE INV_TRANS_CTL_MOD From 4499c265541eefba421d6b2b5d15c1dd6361ed1f Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:39 -0700 Subject: [PATCH 243/263] ldenv=.false for nsys --- src/programs/driver-spectraltransform.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 5c608721f..895af5c1d 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -245,7 +245,7 @@ PROGRAM TRANSFORM_TEST ! Participating processors limited by -P option !-------------------------- -CALL MPL_INIT() +CALL MPL_INIT(LDENV=.false.) !IF( LSTATS ) CALL GSTATS(0,0) ZTINIT=TIMEF() From a9f88929bc41633e45b0f446ef6d6cef2f9ddb22 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:39 -0700 Subject: [PATCH 244/263] disable OpenMP dependent domain decomposition computation in driver --- src/programs/driver-spectraltransform.F90 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 895af5c1d..e512d3ddb 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -310,7 +310,11 @@ PROGRAM TRANSFORM_TEST IF( NPRTRV*NPRTRW /= NPROC ) CYCLE IF( NPRTRV > NPRTRW ) EXIT IF( NPRTRW > NSPECRESMIN ) CYCLE +! With CUDA AWARE MPI we don't need any OpenMP so there is no need for this! Effectively this is even +! undesireable because it may trigger different domain decompositions for no reasons on different machines +#ifndef USE_CUDA_AWARE_MPI_FT IF( NPRTRW <= NSPECRESMIN/(2*OML_MAX_THREADS()) ) EXIT +#endif ENDDO ! GO FOR APPROX SQUARE PARTITION FOR BACKUP IF( NPRTRV*NPRTRW /= NPROC .OR. NPRTRW > NSPECRESMIN .OR. NPRTRV > NPRTRW ) THEN From dd71da0a9289490ed46b1f0c7cf195ca4e649a64 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:49:40 -0700 Subject: [PATCH 245/263] Change output of program driver --- src/programs/driver-spectraltransform.F90 | 84 +++++++++++++++++++---- src/trans/gpu/internal/ltinv_mod.F90 | 4 +- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index e512d3ddb..1000fd2f3 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -82,7 +82,7 @@ PROGRAM TRANSFORM_TEST LOGICAL :: LUSERPNM, LKEEPRPNM, LUSEFLT LOGICAL :: LTRACE_STATS,LSTATS_OMP, LSTATS_COMMS, LSTATS_MPL LOGICAL :: LSTATS,LBARRIER_STATS, LBARRIER_STATS2, LDETAILED_STATS -LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM, LDUMP_DATA +LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM LOGICAL :: LXML_STATS LOGICAL :: LFFTW INTEGER(KIND=JPIM) :: NSTATS_MEM, NTRACE_STATS, NPRNT_STATS @@ -141,7 +141,7 @@ PROGRAM TRANSFORM_TEST & LUSERPNM, LKEEPRPNM, LUSEFLT, NQ, NLIN, IMAX_FLDS_IN, & & NPRINTNORMS, ITERS, ZMAXERR_CHECK, NPROMA, NPROMATR, LEQ_REGIONS, & & NPRINTLEV, NPRTRW, NPRTRV, NSPECRESMIN, NFLEVG, MBX_SIZE, LSTACK, & - & LFFTW, LDUMP_DATA + & LFFTW ! ------------------------------------------------------------------ @@ -216,7 +216,6 @@ PROGRAM TRANSFORM_TEST LSTACK=.FALSE. ! Use FFTW LFFTW=.TRUE. -LDUMP_DATA=.TRUE. ! Default number of vertical levels NFLEVG=137 @@ -863,12 +862,9 @@ PROGRAM TRANSFORM_TEST ZTSTEP1(JSTEP)=(TIMEF()-ZTSTEP1(JSTEP))/1000.0_JPRD ! Dump a field to a binary file - if (LDUMP_DATA) THEN - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMVS(:,1,:), 'S', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,3,:), 'U', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZWINDS(:,NFLEVG,4,:), 'V', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, ZGMV(:,NFLEVG,5,:), 'T', NOUTDUMP) - ENDIF + CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) ZTSTEP2(JSTEP)=TIMEF() CALL DIR_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,& @@ -879,6 +875,12 @@ PROGRAM TRANSFORM_TEST & PGP3A=ZGMV(:,:,5:5,:)) ZTSTEP2(JSTEP)=(TIMEF()-ZTSTEP2(JSTEP))/1000.0_JPRD + ! Dump a field to a binary file + CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + ZTSTEP(JSTEP)=(TIMEF()-ZTSTEP(JSTEP))/1000.0_JPRD ZTSTEPAVG=ZTSTEPAVG+ZTSTEP(JSTEP) @@ -929,6 +931,18 @@ PROGRAM TRANSFORM_TEST ELSE WRITE(NOUT,'("time step ",I6," took", F8.4)') JSTEP,ZTSTEP(JSTEP) ENDIF + flush(nout) + ! call acc_present_dump() + ! print *, "going to free in 3 seconds" + ! call sleep (1) + ! print *, "going to free in 2 seconds" + ! call sleep (1) + ! print *, "going to free in 1 seconds" + ! call sleep (1) + ! !call acc_clear_freelists() + ! call sleep (5) + ! !call acc_present_dump() + ! !call sleep (10000) ENDDO ZTLOOP=(TIMEF()-ZTLOOP)/1000.0_JPRD @@ -1274,15 +1288,55 @@ SUBROUTINE SORT(A, N) ! ------------------------------------------------------------------ -SUBROUTINE DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, FLD, FLDCHAR, NOUTDUMP) +SUBROUTINE DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) ! Dump a 2D field to a binary file. INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file -INTEGER(KIND=JPIM), INTENT(IN) :: NPROMA ! Size of NPROMA -INTEGER(KIND=JPIM), INTENT(IN) :: NGPBLKS ! Number of NPROMA blocks -REAL(KIND=JPRB) , INTENT(IN) :: FLD(NPROMA,NGPBLKS) ! 2D field +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:) ! 2D field +CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier +INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file + +CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +WRITE(NOUTDUMP) FLD +CLOSE(NOUTDUMP) + +END SUBROUTINE DUMP_GRIDPOINT_FIELD_2D +SUBROUTINE DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) + +! Dump a 3D field to a binary file. + +INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file +INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:,:) ! 3D field +CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier +INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file + +CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +WRITE(NOUTDUMP) FLD +CLOSE(NOUTDUMP) + +END SUBROUTINE DUMP_GRIDPOINT_FIELD_3D +SUBROUTINE DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) + +! Dump a 4D field to a binary file. + +INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file +INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:,:,:) ! 4D field CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file @@ -1293,9 +1347,9 @@ SUBROUTINE DUMP_GRIDPOINT_FIELD(JSTEP, MYPROC, NPROMA, NGPBLKS, FLD, FLDCHAR, NO WRITE(FILENAME(7:10),'(I4.4)') MYPROC OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") -WRITE(NOUTDUMP) RESHAPE(FLD, (/ NPROMA*NGPBLKS /)) +WRITE(NOUTDUMP) FLD CLOSE(NOUTDUMP) -END SUBROUTINE DUMP_GRIDPOINT_FIELD +END SUBROUTINE DUMP_GRIDPOINT_FIELD_4D END PROGRAM TRANSFORM_TEST diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index bca1d4cdd..8861796a4 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -349,13 +349,13 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IF(PRESENT(PSPSC3A) .AND. NF_SC3A > 0) THEN DO J3=1,UBOUND(PSPSC3A,3) CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3A-1,:,:),PSPSC3A(:,:,J3),NF_SC3A,UBOUND(PSPSC3A,2)) - IFIRST = IFIRST+2*NF_SC3A + IFIRST = IFIRST+2*NF_SC3A ENDDO ENDIF IF(PRESENT(PSPSC3B) .AND. NF_SC3B > 0) THEN DO J3=1,UBOUND(PSPSC3B,3) CALL PRFI1B(PSCALARS(IFIRST:IFIRST+2*NF_SC3B-1,:,:),PSPSC3B(:,:,J3),NF_SC3B,UBOUND(PSPSC3B,2)) - IFIRST = IFIRST+2*NF_SC3B + IFIRST = IFIRST+2*NF_SC3B ENDDO ENDIF IF(IFIRST-1 /= 2*KF_SCALARS) THEN From 1f747f1b384db92081e6caebfa6acc729071eee1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 24 May 2022 05:55:08 -0700 Subject: [PATCH 246/263] add second executable --- src/programs/CMakeLists.txt | 19 + .../driver-spectraltransform_indiv.F90 | 1571 +++++++++++++++++ 2 files changed, 1590 insertions(+) create mode 100644 src/programs/driver-spectraltransform_indiv.F90 diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index a4f2aa5f0..8fc85159e 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -53,6 +53,25 @@ if( HAVE_GPU ) set_property( TARGET driver-spectrans-CA-${prec} PROPERTY CUDA_ARCHITECTURES 70 ) target_compile_options( driver-spectrans-CA-${prec} PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) set_target_properties(driver-spectrans-CA-${prec} PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") + + ecbuild_add_executable(TARGET driver-spectrans-CA-${prec}-indiv + SOURCES driver-spectraltransform_indiv.F90 + INCLUDES + ${MPI_Fortran_INCLUDE_PATH} + $ + LIBS + fiat parkind_${prec} + eccodes_f90 eccodes_memfs + ${MPI_Fortran_LIBRARIES} + trans_gpu_static_CA_${prec} + gpu + OpenACC::OpenACC_Fortran + ${LAPACK_LIBRARIES} + nvhpcwrapnvtx + ) + set_property( TARGET driver-spectrans-CA-${prec}-indiv PROPERTY CUDA_ARCHITECTURES 70 ) + target_compile_options( driver-spectrans-CA-${prec}-indiv PRIVATE $<$:-g -acc -Minfo=acc -gpu=cc70,lineinfo,deepcopy,fastmath,nordc,pinned -cudalib=cufft,cublas -fpic> ) + set_target_properties(driver-spectrans-CA-${prec}-indiv PROPERTIES LINK_FLAGS "-acc -cudalib=cufft,cublas -fpic -gpu=cc70,pinned") message("Building ${prec} GPU driver") endif() endforeach() diff --git a/src/programs/driver-spectraltransform_indiv.F90 b/src/programs/driver-spectraltransform_indiv.F90 new file mode 100644 index 000000000..920d92fa5 --- /dev/null +++ b/src/programs/driver-spectraltransform_indiv.F90 @@ -0,0 +1,1571 @@ +! (C) Copyright 2014- ECMWF. +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +PROGRAM TRANSFORM_TEST + +! +! Spectral transform test +! +! This test performs spectral to real and real to spectral transforms repeated in +! timed loop. +! +! +! Author : George Mozdzynski +! + +USE PARKIND1 ,ONLY : JPIM ,JPRB, JPRD +USE OML_MOD ,ONLY : OML_MAX_THREADS +USE MPL_MPIF +USE MPL_MODULE +USE GRIB_API +USE YOMGSTATS, ONLY: JPMAXSTAT, YLSTATS => LSTATS +USE TPM_DISTR, ONLY: D +USE EQ_REGIONS_MOD ,ONLY : MY_REGION_NS, MY_REGION_EW +USE TPM_FIELDS, ONLY: F + +IMPLICIT NONE + +INTEGER(KIND=JPIM) :: RETURN_CODE + +CHARACTER CTYPEG*1 +CHARACTER*127 CINSF, CRTABLE +CHARACTER*127 CGRIDTYPE,CFNAME + +! Maximum latitudes, currently equal to TCO7999 +INTEGER(KIND=JPIM),PARAMETER :: JPMLAT=16000 + +INTEGER(KIND=JPIM) :: ISTACK, GETSTACKUSAGE +REAL(KIND=JPRB), DIMENSION(1) :: ZMAXERR(5), ZERR(5) +REAL(KIND=JPRB) :: ZMAXERRG + +INTEGER(KIND=JPIM) :: NRGRI(JPMLAT) +INTEGER(KIND=JPIM) :: NERR,NULNAM,NLIN,INSF,NSMAX,NDGL,NQ +INTEGER(KIND=JPIM) :: ITABLE,NOUT,NOUTDUMP,NSPEC2,NGPTOT,NGPTOTG,IFLD,IFLDS,ICODE,IOUTSF,JROC,JB +INTEGER(KIND=JPIM) :: IERR,ITAG,NSPEC2G,IRET,NTYPE,I,IGRIBOUT,IMAX_FLDS_IN +INTEGER(KIND=JPIM) :: JF,JA,IB,JPRTRV +INTEGER(KIND=JPIM), DIMENSION(1) :: IPARAM,IGRIB,IEDITION,ICURLEV +INTEGER(KIND=JPIM) ,ALLOCATABLE :: NLOEN(:),ITO(:),NPRCIDS(:) +INTEGER(KIND=JPIM) :: MYPROC,JJ +INTEGER :: JSTEP, JJSTEP +REAL(KIND=JPRD) :: ZTINIT,ZTLOOP,TIMEF, ZTSTEPMAX, ZTSTEPMIN, ZTSTEPAVG, ZTSTEPMED +REAL(KIND=JPRD) :: ZTSTEPMAX1, ZTSTEPMIN1, ZTSTEPAVG1, ZTSTEPMED1 +REAL(KIND=JPRD) :: ZTSTEPMAX2, ZTSTEPMIN2, ZTSTEPAVG2, ZTSTEPMED2 +REAL(KIND=JPRD),ALLOCATABLE :: ZTSTEP(:), ZTSTEP1(:), ZTSTEP2(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZFPDAT(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZNORMSP(:),ZNORMSP1(:),ZNORMDIV(:),ZNORMDIV1(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZNORMVOR(:),ZNORMVOR1(:),ZNORMT(:),ZNORMT1(:) +REAL(KIND=JPRB),ALLOCATABLE :: ZNORM(:),ZNORM1(:) +REAL(KIND=JPRD) :: ZAVEAVE(0:JPMAXSTAT) + +! GRID-POINT SPACE DATA STRUCTURES +REAL(KIND=JPRB), ALLOCATABLE :: ZWINDS (:,:,:,:) ! Multilevel fields at t and t-dt +REAL(KIND=JPRB), ALLOCATABLE, TARGET :: ZGMV (:,:,:,:) ! Multilevel fields at t and t-dt +REAL(KIND=JPRB), ALLOCATABLE, TARGET :: ZGMVS (:,:,:) ! Single level fields at t and t-dt + +! SPECTRAL SPACE DATA STRUCTURES +REAL(KIND=JPRB), ALLOCATABLE :: ZSPVORG(:,:) +REAL(KIND=JPRB), ALLOCATABLE :: ZSPDIVG(:,:) +REAL(KIND=JPRB), ALLOCATABLE :: ZSPSPG(:,:) +REAL(KIND=JPRB), ALLOCATABLE :: ZSPTG(:,:,:) +REAL(KIND=JPRB), ALLOCATABLE, TARGET :: SP3D(:,:,:) +REAL(KIND=JPRB), POINTER :: ZVOR(:,:) => NULL() +REAL(KIND=JPRB), POINTER :: ZDIV(:,:) => NULL() +REAL(KIND=JPRB), POINTER :: ZT(:,:,:) => NULL() +REAL(KIND=JPRB), ALLOCATABLE :: ZSP(:,:) + +LOGICAL :: LSTACK +LOGICAL :: LDONE,LSTDEV +LOGICAL :: LUSERPNM, LKEEPRPNM, LUSEFLT +LOGICAL :: LTRACE_STATS,LSTATS_OMP, LSTATS_COMMS, LSTATS_MPL +LOGICAL :: LSTATS,LBARRIER_STATS, LBARRIER_STATS2, LDETAILED_STATS +LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM +LOGICAL :: LXML_STATS +LOGICAL :: LFFTW +INTEGER(KIND=JPIM) :: NSTATS_MEM, NTRACE_STATS, NPRNT_STATS +! 0 - no output, 1 - init and final result, 2 - every timestep +INTEGER(KIND=JPIM) :: NPRINTNORMS=0 +LOGICAL :: LMPOFF +INTEGER(KIND=JPIM) :: ITERS=100 + +REAL(KIND=JPRB) :: ZMAXERR_CHECK=0.0_JPRB +REAL(KIND=JPRB) :: ZRA=6371229._JPRB + +INTEGER(KIND=JPIM) :: NMAX_RESOL +INTEGER(KIND=JPIM) :: NPRINTLEV +INTEGER(KIND=JPIM) :: NPROMATR +INTEGER(KIND=JPIM) :: NCOMBFLEN + +INTEGER(KIND=JPIM) :: NPROC +INTEGER(KIND=JPIM) :: NTHREAD +INTEGER(KIND=JPIM) :: NPRGPNS +INTEGER(KIND=JPIM) :: NPRGPEW +INTEGER(KIND=JPIM) :: NPRTRV +INTEGER(KIND=JPIM) :: NPRTRW +INTEGER(KIND=JPIM) :: NSPECRESMIN +INTEGER(KIND=JPIM) :: MYSETV +INTEGER(KIND=JPIM) :: MYSETW +INTEGER(KIND=JPIM) :: MYSETA +INTEGER(KIND=JPIM) :: MYSETB +INTEGER(KIND=JPIM) :: MP_TYPE +INTEGER(KIND=JPIM) :: MBX_SIZE + +INTEGER(KIND=JPIM), ALLOCATABLE :: NUMLL(:), IVSET(:), NPSURF(:) +INTEGER(KIND=JPIM) :: IVSETSC(1) +INTEGER(KIND=JPIM) :: NPSP ! Set to 1 if PE has V set with surface variables + +INTEGER(KIND=JPIM) :: NFLEVG, NFLEVL +! SUMPINI +INTEGER(KIND=JPIM) :: ISQR +LOGICAL :: LSYNC_TRANS +LOGICAL :: LEQ_REGIONS + + +INTEGER(KIND=JPIM) :: NPROMA +INTEGER(KIND=JPIM) :: NGPBLKS +! LOCALS +INTEGER(KIND=JPIM) :: IPRTRV +INTEGER(KIND=JPIM) :: IPRTRW +INTEGER(KIND=JPIM) :: IPRUSED, ILEVPP, IREST, ILEV, JLEV, ILASTLEV + +LOGICAL :: LLINFO + +INTEGER(KIND=JPIM) :: NDIMGMV ! Third dim. of GMV "(NPROMA,NFLEVG,NDIMGMV,NGPBLKS)" +INTEGER(KIND=JPIM) :: NDIMGMVS ! Second dim. GMVS "(NPROMA,NDIMGMVS,NGPBLKS)" + +NAMELIST/NAMRGRI/ NRGRI +NAMELIST/NAMTRANS/ LSTATS, LBARRIER_STATS, LBARRIER_STATS2, LDETAILED_STATS, & + & LUSERPNM, LKEEPRPNM, LUSEFLT, NQ, NLIN, IMAX_FLDS_IN, & + & NPRINTNORMS, ITERS, ZMAXERR_CHECK, NPROMA, NPROMATR, LEQ_REGIONS, & + & NPRINTLEV, NPRTRW, NPRTRV, NSPECRESMIN, NFLEVG, MBX_SIZE, LSTACK, & + & LFFTW + +! ------------------------------------------------------------------ + +#include "setup_trans0.h" +#include "setup_trans.h" +#include "inv_trans.h" +#include "dir_trans.h" +#include "dist_spec.h" +#include "gath_grid.h" +#include "trans_inq.h" +#include "specnorm.h" +#include "abor1.intfb.h" +#include "gstats_setup.intfb.h" + +! Default initializations +NERR = 0 +NULNAM = 4 +NOUT = 6 +! Unit number for file to dump 2D fields to +NOUTDUMP = 7 +! Max number of resolutions +NMAX_RESOL=37 +! Print level +NPRINTLEV=0 +! NPROMA for trans lib +NPROMATR=0 +! Size of comm buffer +NCOMBFLEN=1800000 +! EQ REGIONS flag +LEQ_REGIONS=.TRUE. +! Message Passing switch +LMPOFF=.FALSE. +! Activate barrier sync +LSYNC_TRANS=.false. +! Number of procs +NPROC=0 +! Grid-point decomp +NPRGPNS=0 +NPRGPEW=0 +! Spectral decomp +NPRTRW=0 +NPRTRV=0 +! Minimum spectral resolution +! Used for controlling NPRTRW +NSPECRESMIN=0 +! Message passing type +MP_TYPE=2 +! Mailbox size +MBX_SIZE=150000000 +! GSTATS statistics +LSTATS=.FALSE. +LDETAILED_STATS=.FALSE. +LSTATS_OMP=.FALSE. +LSTATS_COMMS=.FALSE. +LSTATS_MPL=.FALSE. +LBARRIER_STATS=.FALSE. +LBARRIER_STATS2=.FALSE. +LSTATSCPU=.FALSE. +LSYNCSTATS=.FALSE. +LXML_STATS=.FALSE. +LTRACE_STATS=.FALSE. +NSTATS_MEM=0 +LSTATS_MEM=.FALSE. +LSTATS_ALLOC=.FALSE. +NTRACE_STATS=0 +NPRNT_STATS=1 +LUSERPNM=.FALSE. +LKEEPRPNM=.FALSE. +! Use fast Legendre transforms +LUSEFLT=.FALSE. +! Output stack info +LSTACK=.FALSE. +! Use FFTW +LFFTW=.TRUE. + +! Default number of vertical levels +NFLEVG=137 +! Number of 3D grid-point fields in GMV +NDIMGMV=9 +! Number of 2D grid-point fields in GMVS +! surface pressure, north south der, east-west der +NDIMGMVS=3 +! Set defaults for options +CINSF = ' ' +CTYPEG = 'r' +LSTDEV = .FALSE. +NLIN = 1 +NDGL = 0 +NQ = 0 +CRTABLE = '.' + +! Locals +ILASTLEV = 0 + +! Read NAMELIST to override defaults +REWIND(NULNAM) +READ(NULNAM,NAMTRANS) + +! Message passing setup +! Participating processors limited by -P option + +!-------------------------- +CALL MPL_INIT() +!IF( LSTATS ) CALL GSTATS(0,0) +ZTINIT=TIMEF() + +NPROC= MPL_NPROC() +MYPROC = MPL_MYRANK() +NTHREAD= OML_MAX_THREADS() + +! ONLY OUTPUT TO STDOUT ON PE 1 +IF( NPROC > 1 ) THEN + IF( MYPROC /= 1 ) THEN + OPEN(UNIT=NOUT, FILE='/dev/null') + ENDIF +ENDIF + +IF(LDETAILED_STATS)THEN + LSTATS_OMP=.TRUE. + LSTATS_COMMS=.TRUE. + LSTATS_MPL=.TRUE. + LSTATSCPU=.TRUE. + NPRNT_STATS=NPROC +! LSTATS_MEM=.TRUE. +! LSTATS_ALLOC=.TRUE. +ENDIF + +!------------------------- + +ALLOCATE(NPRCIDS(NPROC)) +DO JJ=1,NPROC + NPRCIDS(JJ) = JJ +ENDDO + +IF( NPROC <= 1 ) LMPOFF=.TRUE. +! COMPUTE NPRGPNS and NPRGPEW +! THIS VERSION SELECTS MOST SQUARE-LIKE DISTRIBUTION +! THESE WILL CHANGE IF LEQ_REGIONS=.TRUE. +IF( NPROC == 0 ) NPROC = 1 +ISQR=INT(SQRT(REAL(NPROC,JPRB))) +DO JA=ISQR,NPROC + IB=NPROC/JA + IF( JA*IB == NPROC ) THEN + NPRGPNS=MAX(JA,IB) + NPRGPEW=MIN(JA,IB) + EXIT + ENDIF +ENDDO + +! FROM SUMPINI, ALTHOUGH THIS +! SHOULD BE SPECIFIED IN NAMELIST +IF( NSPECRESMIN==0 ) NSPECRESMIN=NPROC + +! COMPUTE NPRTRV AND NPRTRW +! IF NOT PROVIDED IN NAMELIST +IF( NPRTRV > 0 .OR. NPRTRW > 0 ) THEN + IF( NPRTRV == 0 ) NPRTRV=NPROC/NPRTRW + IF( NPRTRW == 0 ) NPRTRW=NPROC/NPRTRV + IF( NPRTRW*NPRTRV /= NPROC ) CALL ABOR1('TRANSFORM_TEST:NPRTRW*NPRTRV /= NPROC') + IF( NPRTRW > NSPECRESMIN ) CALL ABOR1('TRANSFORM_TEST:NPRTRW > NSPECRESMIN') +ELSE + DO JPRTRV=4,NPROC + NPRTRV=JPRTRV + NPRTRW=NPROC/NPRTRV + IF( NPRTRV*NPRTRW /= NPROC ) CYCLE + IF( NPRTRV > NPRTRW ) EXIT + IF( NPRTRW > NSPECRESMIN ) CYCLE + IF( NPRTRW <= NSPECRESMIN/(2*OML_MAX_THREADS()) ) EXIT + ENDDO + ! GO FOR APPROX SQUARE PARTITION FOR BACKUP + IF( NPRTRV*NPRTRW /= NPROC .OR. NPRTRW > NSPECRESMIN .OR. NPRTRV > NPRTRW ) THEN + ISQR=INT(SQRT(REAL(NPROC,JPRB))) + DO JA=ISQR,NPROC + IB=NPROC/JA + IF (JA*IB == NPROC) THEN + NPRTRW=MAX(JA,IB) + NPRTRV=MIN(JA,IB) + IF (NPRTRW > NSPECRESMIN ) CALL ABOR1('TRANSFORM_TEST:NPRTRW & + & (approx square value) > NSPECRESMIN') + EXIT + ENDIF + ENDDO + ENDIF +ENDIF + +! Create communicators for MPI groups +IF (.NOT.LMPOFF) THEN + CALL MPL_GROUPS_CREATE(NPRTRW,NPRTRV) +ENDIF + +IF (LMPOFF) THEN + MYSETW=(MYPROC-1)/NPRTRV+1 + MYSETV=MOD(MYPROC-1,NPRTRV)+1 +ELSE + CALL MPL_CART_COORDS(MYPROC,MYSETW,MYSETV) + ! Just checking for now... + IPRTRV=MOD(MYPROC-1,NPRTRV)+1 + IPRTRW=(MYPROC-1)/NPRTRV+1 + IF (IPRTRV/=MYSETV .OR. IPRTRW/=MYSETW) THEN + CALL ABOR1('TRANSFORM_TEST:Inconsistency when computing MYSETW and MYSETV') + ENDIF +ENDIF + +IF (.NOT.LMPOFF) THEN + LLINFO=.FALSE. + IF (MYPROC == 1) LLINFO=.TRUE. + CALL MPL_BUFFER_METHOD(KMP_TYPE=MP_TYPE,KMBX_SIZE=MBX_SIZE,KPROCIDS=NPRCIDS,LDINFO=LLINFO) +ENDIF + +! Determine number of local levels for Fourier and Legendre calculations +! based on the values of NFLEVG and NPRTRV +ALLOCATE(NUMLL(NPRTRV+1)) + +ALLOCATE(NPSURF(NPRTRV)) + +! Calculate remainder +IPRUSED=MIN(NFLEVG+1,NPRTRV) +ILEVPP=NFLEVG/NPRTRV +IREST=NFLEVG-ILEVPP*NPRTRV +DO JROC=1,NPRTRV + IF(JROC <= IREST) THEN + NUMLL(JROC)=ILEVPP+1 + ELSE + NUMLL(JROC)=ILEVPP + ENDIF +ENDDO +NUMLL(IPRUSED+1:NPRTRV+1)=0 + +NFLEVL=NUMLL(MYSETV) + +DO JROC=1,IPRUSED + NPSURF(JROC)=0 +ENDDO +NPSURF(IPRUSED)=1 +NPSP=NPSURF(MYSETV) +IVSETSC(1)=IPRUSED + +ITAG = 123456 +IFLD=0 +IFLDS=0 + +IF(MYPROC == 1) THEN + IF(CINSF == ' ') THEN + CINSF = 'fort.11' + ENDIF +ENDIF + +IF(CTYPEG == 'r') THEN + NTYPE = 1 +ELSEIF(CTYPEG == 'f') THEN + NTYPE = 0 +ELSE + WRITE(NERR,*) 'WRONG TYPE OF GRID: ',CTYPEG,' It should be',' either f or r' + CALL ABOR1('TRANSFORM_TEST:WRONG TYPE OF GRID') +ENDIF + +ICODE = 0 + +! Find spectral resolution + +IF(MYPROC == 1) THEN + CALL GRIB_OPEN_FILE(INSF,CINSF,'R',IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*) 'ERROR OPENING FILE INPUT SPECTRAL FILE',CINSF,IRET + CALL ABOR1('TRANSFORM_TEST: ERROR OPENING FILE INPUT SPECTRAL FILE') + ENDIF + CALL GRIB_NEW_FROM_FILE(INSF,IGRIB(1),IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR IN FILE ',CINSF,' : NO INFORMATION' + CALL ABOR1('TRANSFORM_TEST: ERROR GRIB_NEW_FROM_FILE') + ENDIF + CALL GRIB_GET(IGRIB(1),'gridType',CGRIDTYPE,IRET) + IF(CGRIDTYPE /= 'sh') THEN + WRITE(NERR,*)'INPUT DATA NOT IN SPECTRAL FORM' + CALL ABOR1('TRANSFORM_TEST:INPUT DATA NOT IN SPECTRAL FORM') + ENDIF + + CALL GRIB_GET(IGRIB(1),'pentagonalResolutionParameterJ',NSMAX) + +! Decide gridpoint resolution + + IF (NDGL == 0) THEN + CALL SETNDGL + ELSE + CALL CHECK_NDGL + ENDIF +ENDIF + +IF(NPROC > 1) THEN + CALL MPL_BROADCAST(NSMAX,KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') + + CALL MPL_BROADCAST(NDGL,KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') +ENDIF +ALLOCATE(NLOEN(NDGL)) + +IF(MYPROC == 1) THEN + + IF(NTYPE == 0) THEN + NLOEN(:) = 2*NDGL + ELSEIF(NQ == 1.AND.NTYPE == 1) THEN + ! cubic grid + ITABLE = INDEX(CRTABLE,' ') + IF(NSMAX < 1000) THEN + WRITE(CRTABLE(ITABLE:ITABLE+11),'(A,I3.3)') '/rtable_3',NSMAX + ELSE + WRITE(CRTABLE(ITABLE:ITABLE+12),'(A,I4.4)') '/rtable_3',NSMAX + ENDIF + ELSEIF(NQ == 2.AND.NTYPE == 1) THEN + ! cubic grid + Collignon + ITABLE = INDEX(CRTABLE,' ') + IF(NSMAX < 1000) THEN + WRITE(CRTABLE(ITABLE:ITABLE+11),'(A,I3.3)') '/rtable_4',NSMAX + ELSE + WRITE(CRTABLE(ITABLE:ITABLE+12),'(A,I4.4)') '/rtable_4',NSMAX + ENDIF + ELSEIF(NLIN == 0.AND.NTYPE == 1) THEN + ! quadratic grid + ITABLE = INDEX(CRTABLE,' ') + IF(NSMAX < 1000) THEN + WRITE(CRTABLE(ITABLE:ITABLE+11),'(A,I3.3)') '/rtable_2',NSMAX + ELSE + WRITE(CRTABLE(ITABLE:ITABLE+12),'(A,I4.4)') '/rtable_2',NSMAX + ENDIF + ELSEIF(NLIN == 1.AND.NTYPE == 1) THEN + ITABLE = INDEX(CRTABLE,' ') + IF(NSMAX < 1000) THEN + WRITE(CRTABLE(ITABLE:ITABLE+12),'(A,I3.3)') '/rtablel_2',NSMAX + ELSE + WRITE(CRTABLE(ITABLE:ITABLE+13),'(A,I4.4)') '/rtablel_2',NSMAX + ENDIF + ENDIF + IF(NTYPE == 1) THEN + OPEN(15,FILE=CRTABLE,FORM='FORMATTED',ACTION='READ') + READ(15,NAMRGRI) + NLOEN(:) = NRGRI(1:NDGL) + CLOSE(15) + ENDIF + + CALL GRIB_RELEASE(IGRIB(1)) + CALL GRIB_CLOSE_FILE(INSF) + +ENDIF + + +IF(NPROC > 1) THEN + CALL MPL_BROADCAST(NLOEN(:),KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') +ENDIF + +CALL SETUP_TRANS0(KOUT=NOUT,KERR=NERR,KPRINTLEV=NPRINTLEV,KMAX_RESOL=NMAX_RESOL, & +& KPROMATR=NPROMATR,KPRGPNS=NPRGPNS,KPRGPEW=NPRGPEW,KPRTRW=NPRTRW, & +& KCOMBFLEN=NCOMBFLEN,LDMPOFF=LMPOFF,LDSYNC_TRANS=LSYNC_TRANS, & +& LDEQ_REGIONS=LEQ_REGIONS, & +& PRAD=ZRA,LDALLOPERM=.TRUE.) + +CALL SETUP_TRANS(KSMAX=NSMAX,KDGL=NDGL,KLOEN=NLOEN,LDSPLIT=.TRUE.,& +& KFLEV=NFLEVL, LDUSEFFTW=LFFTW,& +& LDUSERPNM=LUSERPNM,LDKEEPRPNM=LKEEPRPNM,LDUSEFLT=LUSEFLT) +! +CALL TRANS_INQ(KSPEC2=NSPEC2,KSPEC2G=NSPEC2G,KGPTOT=NGPTOT,KGPTOTG=NGPTOTG) +DO JB=D%NPTRFRSTLAT(MY_REGION_NS),D%NPTRLSTLAT(MY_REGION_NS) + WRITE(300+MYPROC,*) "MY_REGION", JB-D%NPTRFRSTLAT(MY_REGION_NS)+D%NFRSTLAT(MY_REGION_NS), D%NSTA(JB,MY_REGION_EW), D%NONL(JB,MY_REGION_EW) +ENDDO +IF (MYPROC == 1) THEN + DO JB=1,NDGL + WRITE(300+MYPROC,*) "LATITUDE", JB, ASIN(F%RMU(JB))/3.14159265358979323846264338327950288*180, NLOEN(JB) + ENDDO +ENDIF + + +! Default, no blocking +NPROMA=NGPTOT +! allow NPROMA to be overidden by namelist value +REWIND(NULNAM) +READ(NULNAM,NAMTRANS) +! Calculate number of NPROMA blocks +NGPBLKS=(NGPTOT-1)/NPROMA+1 + +! Allocate spectral arrays +! Try to mimick IFS layout as much as possible +NULLIFY(ZVOR) +NULLIFY(ZDIV) +NULLIFY(ZT) +ALLOCATE(SP3D(NFLEVL,NSPEC2,3)) +ALLOCATE(ZSP(1,NSPEC2)) + +SP3D(:,:,:)=0.0_JPRB +ZSP(:,:) =0.0_JPRB +ZVOR =>SP3D(:,:,1) +ZDIV =>SP3D(:,:,2) +ZT =>SP3D(:,:,3:3) + +! Spectral global buffers +! Allocating only on PE 1 +! Dangerous, but otherwise we run out of memory +! if we run flat MPI +IF(MYPROC == 1) THEN + ALLOCATE(ZFPDAT(NSPEC2G)) + ALLOCATE(ZSPVORG(NFLEVG,NSPEC2G)) + ALLOCATE(ZSPDIVG(NFLEVG,NSPEC2G)) + ALLOCATE(ZSPTG(NFLEVG,NSPEC2G,1)) + ALLOCATE(ZSPSPG(1,NSPEC2G)) +ENDIF + +! Open files +IF(MYPROC == 1) THEN + CALL GRIB_OPEN_FILE(INSF,CINSF,'R',IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*) 'ERROR OPENING FILE INPUT SPECTRAL FILE',CINSF,IRET + CALL ABOR1('TRANSFORM_TEST:ERROR OPENING FILE INPUT SPECTRAL FILE') + ENDIF +ENDIF + +! Spectral to gridpoint transform +LDONE = .FALSE. + +! specify the maximum number of fields to be read from input dataset +! it is not a problem if there are less fields as the actual number of fields +! that will transformed will be replicated from the actual number of fields read +IMAX_FLDS_IN=412 + +! allow IMAX_FLDS_IN to be overidden by namelist value +REWIND(NULNAM) +READ(NULNAM,NAMTRANS) + +! Inititialize GRIB_API handles to zero +IGRIB(:) = 0 +IGRIBOUT = 0 + +DO + IF(MYPROC == 1) THEN + + ! Read and decode spectral field +! WRITE(NOUT,*) ' CALLING GRIB_NEW_FROM_FILE' + CALL GRIB_NEW_FROM_FILE(INSF,IGRIB(1),IRET) + IF(IRET == GRIB_END_OF_FILE) THEN + LDONE = .TRUE. + WRITE(NOUT,'(A)') 'END OF GRIB FILE REACHED.' + ENDIF + + IF(IFLDS==IMAX_FLDS_IN) THEN + LDONE = .TRUE. + ENDIF + + IF(.NOT. LDONE) THEN + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*) 'ERROR GRIB_NEW_FROM_FILE',IRET + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_NEW_FROM_FILE') + ENDIF + + CALL GRIB_GET(IGRIB(1),'edition',IEDITION(1),IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR GRIB_GET edition' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_GET edition') + ENDIF + + CALL GRIB_GET(IGRIB(1),'paramId',IPARAM(1),IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR GRIB_GET paramId' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_GET paramId') + ENDIF + + ! Write out "skipped field" to output spectral file + IF(IPARAM(1) /= ICODE.AND.ICODE /= 0) THEN + PRINT *,'FIELD ',IPARAM(1),' NOT TRANSFORMED' + CALL GRIB_CLONE(IGRIB(1),IGRIBOUT,IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'TRANSFORM_TEST:ERROR GRIB_CLONE' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_CLONE') + ENDIF + CALL GRIB_WRITE(IGRIBOUT,IOUTSF,IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'TRANSFORM_TEST:ERROR GRIB_WRITE' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_WRITE') + ENDIF + CALL GRIB_RELEASE(IGRIBOUT) + CYCLE + ENDIF + + CALL GRIB_GET(IGRIB(1),'level',ICURLEV,IRET) + IF( IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR GRIB_GET level' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_GET level') + ENDIF + + CALL GRIB_GET(IGRIB(1),'shortName',CFNAME,IRET) + IF( IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR GRIB_GET shortName' + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_GET shortName') + ENDIF + + CALL GRIB_GET(IGRIB(1),'values',ZFPDAT,IRET) + IF(IRET /= GRIB_SUCCESS) THEN + WRITE(NERR,*)'ERROR GRIB_GET values ',IRET + CALL ABOR1('TRANSFORM_TEST:ERROR GRIB_GET values') + ENDIF + CALL GRIB_RELEASE(IGRIB(1)) + IFLD=1 + ILEV=ICURLEV(1) + ILASTLEV = MAX(ILEV,ILASTLEV) + + IF( CFNAME == 'lnsp' ) THEN + ZSPSPG(1,:)=ZFPDAT(:) + IFLDS=IFLDS+1 + ENDIF + + IF( CFNAME == 'vo' ) THEN + ZSPVORG(ILEV,:)=ZFPDAT(:) + IFLDS=IFLDS+1 + ENDIF + + IF( CFNAME == 'd' ) THEN + ZSPDIVG(ILEV,:)=ZFPDAT(:) + IFLDS=IFLDS+1 + ENDIF + + IF( CFNAME == 't' ) THEN + ZSPTG(ILEV,:,1)=ZFPDAT(:) + IFLDS=IFLDS+1 + ENDIF + ENDIF + ! Send number of fields in this batch to other processors + IF(NPROC > 1) THEN + DO JROC=2,NPROC + CALL MPL_SEND(IFLD,KDEST=NPRCIDS(JROC),KTAG=ITAG) + ENDDO + ENDIF + + ELSE + ! Receive field + CALL MPL_RECV(IFLD,KSOURCE=NPRCIDS(1),KTAG=ITAG) + ENDIF + + IF(NPROC > 1 .AND. IFLD == 1) THEN + CALL MPL_BROADCAST(IPARAM(IFLD),KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') + ENDIF + IF(IFLD == 0 ) EXIT + + ! Synchronize processors + IF(NPROC > 1) THEN + CALL MPL_BARRIER() + ENDIF +! Distribute batch of fields + + IFLD = 0 +ENDDO + + +IF (MYPROC == 1) THEN + CALL GRIB_CLOSE_FILE(INSF) +ENDIF + +! Broadcast number of fields read to all procs and levels +IF(NPROC > 1) THEN + CALL MPL_BROADCAST(IFLDS,KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') + CALL MPL_BROADCAST(ILASTLEV,KROOT=1,KTAG=ITAG, & + & CDSTRING='TRANSFORM_TEST:') + +ENDIF + +! Some sanity checks +IF( ILASTLEV < 1 ) CALL ABOR1('TRANSFORM_TEST:ILASTLEV < 1') +IF( NFLEVG < ILASTLEV ) CALL ABOR1('TRANSFORM_TEST:NFLEVG < ILASTLEV') + +! Extend fields using mod function +IF( NFLEVG > ILASTLEV ) THEN + IF( MYPROC == 1 ) THEN + DO ILEV=ILASTLEV+1,NFLEVG + ZSPVORG(ILEV,:) = ZSPVORG(MOD(ILEV-1,ILASTLEV)+1,:) + ZSPDIVG(ILEV,:) = ZSPDIVG(MOD(ILEV-1,ILASTLEV)+1,:) + ZSPTG(ILEV,:,1) = ZSPTG(MOD(ILEV-1,ILASTLEV)+1,:,1) + ENDDO + ENDIF +ENDIF + +WRITE(NOUT,'(" ")') +WRITE(NOUT,'("SPECTRAL FIELDS HAVE BEEN SUCCESSFULY READ, IFLDS=",I3)')IFLDS +WRITE(NOUT,'(" ")') + +! PRINT CONFIGURATION DETAILS +WRITE(NOUT,'(A)')'===-=== START OF RUNTIME PARAMETERS ===-===' +WRITE(NOUT,'(" ")') +WRITE(NOUT,'("NLIN= ",I10)') NLIN +WRITE(NOUT,'("NQ= ",I10)') NQ +WRITE(NOUT,'("NSMAX= ",I10)') NSMAX +WRITE(NOUT,'("NDGL= ",I10)') NDGL +WRITE(NOUT,'("NPROC= ",I10)') NPROC +WRITE(NOUT,'("NTHREAD=",I10)') NTHREAD +WRITE(NOUT,'("NPRGPNS=",I10)') NPRGPNS +WRITE(NOUT,'("NPRGPEW=",I10)') NPRGPEW +WRITE(NOUT,'("NPRTRW= ",I10)') NPRTRW +WRITE(NOUT,'("NPRTRV= ",I10)') NPRTRV +WRITE(NOUT,'("NPROMA= ",I10)') NPROMA +WRITE(NOUT,'("NGPTOT= ",I10)') NGPTOT +WRITE(NOUT,'("NGPTOTG=",I10)') NGPTOTG +WRITE(NOUT,'("NFLEVG= ",I10)') NFLEVG +WRITE(NOUT,'("IFLDS= ",I10)') IFLDS +WRITE(NOUT,'("NSPEC2= ",I10)') NSPEC2 +WRITE(NOUT,'("NSPEC2G=",I10)') NSPEC2G +WRITE(NOUT,'("LUSEFLT=",L10)') LUSEFLT +WRITE(NOUT,'(" ")') +WRITE(NOUT,'(A)') '===-=== END OF RUNTIME PARAMETERS ===-===' + + +ALLOCATE(IVSET(NFLEVG)) + +! Compute spectral distribution +ILEV = 0 +DO JB=1,NPRTRV + DO JLEV=1,NUMLL(JB) + ILEV = ILEV + 1 + IVSET(ILEV) = JB + ENDDO +ENDDO + +ALLOCATE(ITO(IFLDS)) +ITO(:)=1 + +! Distribute spectral fields to processors +CALL DIST_SPEC(PSPECG=ZSPVORG,KFDISTG=NFLEVG,KFROM=ITO,PSPEC=ZVOR,KVSET=IVSET(1:NFLEVG)) +CALL DIST_SPEC(PSPECG=ZSPDIVG,KFDISTG=NFLEVG,KFROM=ITO,PSPEC=ZDIV,KVSET=IVSET(1:NFLEVG)) +CALL DIST_SPEC(PSPECG=ZSPTG(:,:,1),KFDISTG=NFLEVG,KFROM=ITO,PSPEC=ZT(:,:,1),KVSET=IVSET(1:NFLEVG)) +CALL DIST_SPEC(PSPECG=ZSPSPG,KFDISTG=1,KFROM=ITO,PSPEC=ZSP,KVSET=IVSETSC(1:1)) + +! Deallocate resources +IF(MYPROC==1) THEN + DEALLOCATE(ZFPDAT,ITO) + DEALLOCATE(ZSPVORG,ZSPDIVG,ZSPTG,ZSPSPG) +ENDIF +! ALLOCATE GRID-POINT ARRAYS +ALLOCATE(ZWINDS(NPROMA,NFLEVG,6,NGPBLKS)) +ALLOCATE(ZGMV(NPROMA,NFLEVG,NDIMGMV,NGPBLKS)) +ALLOCATE(ZGMVS(NPROMA,NDIMGMVS,NGPBLKS)) + +ALLOCATE(ZNORMSP(1)) +ALLOCATE(ZNORMSP1(1)) +ALLOCATE(ZNORMVOR(NFLEVG)) +ALLOCATE(ZNORMVOR1(NFLEVG)) +ALLOCATE(ZNORMDIV(NFLEVG)) +ALLOCATE(ZNORMDIV1(NFLEVG)) +ALLOCATE(ZNORMT(NFLEVG)) +ALLOCATE(ZNORMT1(NFLEVG)) + +IF( NPRINTNORMS > 0 ) THEN + CALL SPECNORM(PSPEC=ZVOR(1:NFLEVL,:),PNORM=ZNORMVOR1,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZDIV(1:NFLEVL,:),PNORM=ZNORMDIV1,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZT(1:NFLEVL,:,1),PNORM=ZNORMT1,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZSP(1:1,:), PNORM=ZNORMSP1,KVSET=IVSETSC(1:1)) + + IF(MYPROC == 1) THEN + DO IFLD=1,1 + WRITE(NOUT,'("SP ZNORM(",I4,")=",F20.15)') IFLD,ZNORMSP1(IFLD) + ENDDO + DO IFLD=1,NFLEVG + WRITE(NOUT,'("DIV ZNORM(",I4,")=",F20.15)') IFLD,ZNORMDIV1(IFLD) + ENDDO + DO IFLD=1,NFLEVG + WRITE(NOUT,'("VOR ZNORM(",I4,")=",F20.15)') IFLD,ZNORMVOR1(IFLD) + ENDDO + DO IFLD=1,NFLEVG + WRITE(NOUT,'("T ZNORM(",I4,")=",F20.15)') IFLD,ZNORMT1(IFLD) + ENDDO + ENDIF +ENDIF + +ZTINIT=(TIMEF()-ZTINIT)/1000.0_JPRD +WRITE(NOUT,'(" ")') +WRITE(NOUT,'(a,I6,a,F9.2,a)') "TRANSFORM_TEST initialisation, on",NPROC,& + & " tasks, took",ZTINIT," sec" +WRITE(NOUT,'(" ")') + +IF(ITERS<=0) CALL ABOR1('TRANSFORM_TEST:ITERS <= 0') + +ALLOCATE(ZTSTEP(ITERS)) +ALLOCATE(ZTSTEP1(ITERS)) +ALLOCATE(ZTSTEP2(ITERS)) + +ZTSTEPAVG=0._JPRD +ZTSTEPMAX=0._JPRD +ZTSTEPMIN=9999999999999999._JPRD +ZTSTEPAVG1=0._JPRD +ZTSTEPMAX1=0._JPRD +ZTSTEPMIN1=9999999999999999._JPRD +ZTSTEPAVG2=0._JPRD +ZTSTEPMAX2=0._JPRD +ZTSTEPMIN2=9999999999999999._JPRD + +WRITE(NOUT,'(A)') '===-=== START OF SPEC TRANSFORMS ===-===' +WRITE(NOUT,'(" ")') + +IF( LSTATS ) THEN + CALL GSTATS(0,0) + CALL GSTATS_SETUP(NPROC,MYPROC,NPRCIDS,& + & LSTATS,LSTATSCPU,LSYNCSTATS,LDETAILED_STATS,LBARRIER_STATS,LBARRIER_STATS2,& + & LSTATS_OMP,LSTATS_COMMS,LSTATS_MEM,NSTATS_MEM,LSTATS_ALLOC,& + & LTRACE_STATS,NTRACE_STATS,NPRNT_STATS,LXML_STATS) + CALL GSTATS_PSUT + CALL GSTATS_LABEL_IFS +ENDIF + +ZTLOOP=TIMEF() +! simulated time stepping loop + +!skip time measurements for first iteration +YLSTATS = .false. + +DO JSTEP=1,1 + IF (JSTEP > 1) YLSTATS = .true. + ZTSTEP(JSTEP)=TIMEF() + ZTSTEP1(JSTEP)=TIMEF() + + ! scalar parts only + JJSTEP = 1 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGP2=ZGMVS(:,1:3,:),PGP3A=ZGMV(:,:,5:7,:), & + & LDSCDERS=.TRUE.,LDVORGP=.FALSE.,LDDIVGP=.FALSE.,LDUVDER=.FALSE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) + + ! all; split, all options on + JJSTEP = 2 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGPUV=ZWINDS(:,:,1:4,:),PGP2=ZGMVS(:,1:3,:),PGP3A=ZGMV(:,:,5:7,:),& + & LDSCDERS=.TRUE.,LDVORGP=.TRUE.,LDDIVGP=.TRUE.,LDUVDER=.FALSE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) + + ! only wind + JJSTEP = 3 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,& + & PGPUV=ZWINDS(:,:,3:4,:),& + & LDVORGP=.FALSE.,LDDIVGP=.false.,LDUVDER=.FALSE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) + + ! all; split + JJSTEP = 4 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGPUV=ZWINDS(:,:,2:4,:),PGP2=ZGMVS(:,1:3,:),PGP3A=ZGMV(:,:,5:7,:),& + & LDSCDERS=.TRUE.,LDVORGP=.FALSE.,LDDIVGP=.TRUE.,LDUVDER=.FALSE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) + + ! scalar only; with derivatives + JJSTEP = 5 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPSCALAR=ZSP(1:1,:),& + & PGP=ZGMVS(:,1:3,:), & + & LDSCDERS=.TRUE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETSC=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + + ! scalar only + JJSTEP = 6 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPSCALAR=ZSP(1:1,:),& + & PGP=ZGMVS(:,1:1,:),& + & KRESOL=1,KPROMA=NPROMA,KVSETSC=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + + ! scalar split only + JJSTEP = 7 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPSC2=ZSP(1:1,:),& + & PGP2=ZGMVS(:,1:3,:),& + & LDSCDERS=.TRUE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + + ! scalar split only; with derivatives + JJSTEP = 8 + ZGMVS = 0 + ZWINDS = 0 + ZGMV = 0 + CALL INV_TRANS(PSPSC2=ZT(:,:,1),& + & PGP2=ZGMV(:,:,5,:), & + & KRESOL=1,KPROMA=NGPTOT,KVSETSC2=IVSET) + CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) + + ZTSTEP1(JSTEP)=(TIMEF()-ZTSTEP1(JSTEP))/1000.0_JPRD + ZTSTEP2(JSTEP)=TIMEF() + + CALL INV_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGPUV=ZWINDS(:,:,1:4,:),PGP2=ZGMVS(:,1:3,:),PGP3A=ZGMV(:,:,5:7,:),& + & LDSCDERS=.TRUE.,LDVORGP=.TRUE.,LDDIVGP=.TRUE.,LDUVDER=.FALSE.,& + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + + JJSTEP = 9 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSCALAR=ZSP(1:1,:), & + & PGP=ZGMVS(:,1:1,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETSC=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + + JJSTEP = 10 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGP2=ZGMVS(:,1:1,:),PGP3A=ZGMV(:,:,5:5,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + + JJSTEP = 11 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPVOR=ZVOR(1:NFLEVL,:),PSPDIV=ZDIV(1:NFLEVL,:),& + & PGPUV=ZWINDS(:,1:NFLEVG,3:4,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET(1:NFLEVG)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) + + JJSTEP = 12 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSC2=ZSP(1:1,:),& + & PGP2=ZGMVS(:,1:1,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + + JJSTEP = 13 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSC2=ZT(:,:,1),& + & PGP2=ZGMV(:,:,5,:), & + & KRESOL=1,KPROMA=NGPTOT,KVSETSC2=IVSET) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + + JJSTEP = 14 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSC2=ZSP(1:1,:),PSPSC3A=ZT(:,:,:),& + & PGP2=ZGMVS(:,1:1,:),PGP3A=ZGMV(:,:,5:5,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET(:)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + + JJSTEP = 16 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPVOR=ZVOR(:,:),PSPDIV=ZDIV(:,:),& + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET(:),& + & PGPUV=ZWINDS(:,:,3:4,:)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) + + JJSTEP = 17 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPSCALAR=ZSP(1:1,:),PGP=ZGMVS(:,1:1,:),KVSETSC=IVSETSC(1:1)) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'V', NOUTDUMP) + + JJSTEP = 15 + ZSP = 0 + ZT = 0 + ZVOR = 0 + ZDIV = 0 + CALL DIR_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,PSPSC2=ZSP(1:1,:),PSPSC3A=ZT,& + & PGPUV=ZWINDS(:,:,3:4,:),PGP2=ZGMVS(:,1:1,:),PGP3A=ZGMV(:,:,5:5,:), & + & KRESOL=1,KPROMA=NPROMA,KVSETUV=IVSET,KVSETSC2=IVSETSC(1:1),& + & KVSETSC3A=IVSET) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + + ZTSTEP2(JSTEP)=(TIMEF()-ZTSTEP2(JSTEP))/1000.0_JPRD + + ZTSTEP(JSTEP)=(TIMEF()-ZTSTEP(JSTEP))/1000.0_JPRD + + ZTSTEPAVG=ZTSTEPAVG+ZTSTEP(JSTEP) + ZTSTEPMIN=MIN(ZTSTEP(JSTEP),ZTSTEPMIN) + ZTSTEPMAX=MAX(ZTSTEP(JSTEP),ZTSTEPMAX) + + ZTSTEPAVG1=ZTSTEPAVG1+ZTSTEP1(JSTEP) + ZTSTEPMIN1=MIN(ZTSTEP1(JSTEP),ZTSTEPMIN1) + ZTSTEPMAX1=MAX(ZTSTEP1(JSTEP),ZTSTEPMAX1) + + ZTSTEPAVG2=ZTSTEPAVG2+ZTSTEP2(JSTEP) + ZTSTEPMIN2=MIN(ZTSTEP2(JSTEP),ZTSTEPMIN2) + ZTSTEPMAX2=MAX(ZTSTEP2(JSTEP),ZTSTEPMAX2) + + + IF( NPRINTNORMS > 1 )THEN + CALL SPECNORM(PSPEC=ZSP(1:1,:), PNORM=ZNORMSP, KVSET=IVSETSC(1:1)) + CALL SPECNORM(PSPEC=ZVOR(1:NFLEVL,:), PNORM=ZNORMVOR,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZDIV(1:NFLEVL,:), PNORM=ZNORMDIV,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZT(1:NFLEVL,:,1), PNORM=ZNORMT, KVSET=IVSET(1:NFLEVG)) + + IF( MYPROC==1 ) THEN + ! SURFACE PRESSURE + ZMAXERR(:)=-999.0 + DO IFLD=1,1 + ZERR(1)=ABS(ZNORMSP1(IFLD)/ZNORMSP(IFLD)-1.0_JPRB) + ZMAXERR(1)=MAX(ZMAXERR(1),ZERR(1)) + ENDDO + ! DIVERGENCE + DO IFLD=1,NFLEVG + ZERR(2)=ABS(ZNORMDIV1(IFLD)/ZNORMDIV(IFLD)-1.0_JPRB) + ZMAXERR(2)=MAX(ZMAXERR(2),ZERR(2)) + ENDDO + ! VORTICITY + DO IFLD=1,NFLEVG + ZERR(3)=ABS(ZNORMVOR1(IFLD)/ZNORMVOR(IFLD)-1.0_JPRB) + ZMAXERR(3)=MAX(ZMAXERR(3),ZERR(3)) + ENDDO + ! TEMPERATURE + DO IFLD=1,NFLEVG + ZERR(4)=ABS(ZNORMT1(IFLD)/ZNORMT(IFLD)-1.0_JPRB) + ZMAXERR(4)=MAX(ZMAXERR(4),ZERR(4)) + ENDDO + WRITE(NOUT,'("time step ",I6," took", F8.4," | SP max err="E10.3,& + & " | DIV max err="E10.3," | VOR max err="E10.3," | T max err="E10.3)') & + & JSTEP,ZTSTEP(JSTEP),ZMAXERR(1),ZMAXERR(2),ZMAXERR(3),ZMAXERR(4) + ENDIF + ELSE + WRITE(NOUT,'("time step ",I6," took", F8.4)') JSTEP,ZTSTEP(JSTEP) + ENDIF + flush(nout) + ! call acc_present_dump() + ! print *, "going to free in 3 seconds" + ! call sleep (1) + ! print *, "going to free in 2 seconds" + ! call sleep (1) + ! print *, "going to free in 1 seconds" + ! call sleep (1) + ! !call acc_clear_freelists() + ! call sleep (5) + ! !call acc_present_dump() + ! !call sleep (10000) +ENDDO + +ZTLOOP=(TIMEF()-ZTLOOP)/1000.0_JPRD + +WRITE(NOUT,'(" ")') +WRITE(NOUT,'(A)') '===-=== END OF SPEC TRANSFORMS ===-===' +WRITE(NOUT,'(" ")') + + +IF( NPRINTNORMS > 0 ) THEN + CALL SPECNORM(PSPEC=ZVOR(1:NFLEVL,:),PNORM=ZNORMVOR,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZDIV(1:NFLEVL,:),PNORM=ZNORMDIV,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZT(1:NFLEVL,:,1),PNORM=ZNORMT,KVSET=IVSET(1:NFLEVG)) + CALL SPECNORM(PSPEC=ZSP(1:1,:), PNORM=ZNORMSP,KVSET=IVSETSC(1:1)) + + IF(MYPROC == 1) THEN + ! SURFACE PRESSURE + ZMAXERR(:)=-999.0 + DO IFLD=1,1 + ZERR(1)=ABS(ZNORMSP1(IFLD)/ZNORMSP(IFLD)-1.0D0) + ZMAXERR(1)=MAX(ZMAXERR(1),ZERR(1)) + WRITE(NOUT,'("SP ZNORM(",I4,")=",F20.15," ERR=",E10.3)') IFLD,ZNORMSP(IFLD),ZERR(1) + ENDDO + ! DIVERGENCE + DO IFLD=1,NFLEVG + ZERR(2)=ABS(ZNORMDIV1(IFLD)/ZNORMDIV(IFLD)-1.0D0) + ZMAXERR(2)=MAX(ZMAXERR(2),ZERR(2)) + WRITE(NOUT,'("DIV ZNORM(",I4,")=",F20.15," ERR=",E10.3)') IFLD,ZNORMDIV(IFLD),ZERR(2) + ENDDO + ! VORTICITY + DO IFLD=1,NFLEVG + ZERR(3)=ABS(ZNORMVOR1(IFLD)/ZNORMVOR(IFLD)-1.0D0) + ZMAXERR(3)=MAX(ZMAXERR(3),ZERR(3)) + WRITE(NOUT,'("VOR ZNORM(",I4,")=",F20.15," ERR=",E10.3)') IFLD,ZNORMVOR(IFLD),ZERR(3) + ENDDO + ! TEMPERATURE + DO IFLD=1,NFLEVG + ZERR(4)=ABS(ZNORMT1(IFLD)/ZNORMT(IFLD)-1.0D0) + ZMAXERR(4)=MAX(ZMAXERR(4),ZERR(4)) + WRITE(NOUT,'("T ZNORM(",I4,")=",F20.15," ERR=",E10.3)') IFLD,ZNORMT(IFLD),ZERR(4) + ENDDO + ! MAXIMUM ERROR ACROSS ALL FIELDS + ZMAXERRG=MAX(MAX(ZMAXERR(1),ZMAXERR(2)),MAX(ZMAXERR(2),ZMAXERR(3))) + + WRITE(NOUT,'("SURFACE PRESSURE MAX ERROR=",E10.3)')ZMAXERR(1) + WRITE(NOUT,'("DIVERGENCE MAX ERROR=",E10.3)')ZMAXERR(2) + WRITE(NOUT,'("VORTICITY MAX ERROR=",E10.3)')ZMAXERR(3) + WRITE(NOUT,'("TEMPERATURE MAX ERROR=",E10.3)')ZMAXERR(4) + WRITE(NOUT,'("GLOBAL MAX ERROR=",E10.3)')ZMAXERRG + + ENDIF +ENDIF + +CALL MPL_ALLREDUCE(ZTLOOP, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEP, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPAVG, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMAX, 'MAX', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMIN, 'MIN', LDREPROD=.FALSE.) + +CALL MPL_ALLREDUCE(ZTSTEP1, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPAVG1, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMAX1, 'MAX', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMIN1, 'MIN', LDREPROD=.FALSE.) + +CALL MPL_ALLREDUCE(ZTSTEP2, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPAVG2, 'SUM', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMAX2, 'MAX', LDREPROD=.FALSE.) +CALL MPL_ALLREDUCE(ZTSTEPMIN2, 'MIN', LDREPROD=.FALSE.) + + +ZTSTEPAVG=(ZTSTEPAVG/REAL(NPROC,JPRB))/REAL(ITERS,JPRD) +ZTLOOP=ZTLOOP/REAL(NPROC,JPRD) +ZTSTEP(:)=ZTSTEP(:)/REAL(NPROC,JPRD) + +CALL SORT(ZTSTEP,ITERS) +ZTSTEPMED = ZTSTEP(ITERS/2) + +ZTSTEPAVG1=(ZTSTEPAVG1/REAL(NPROC,JPRB))/REAL(ITERS,JPRD) +ZTSTEP1(:)=ZTSTEP1(:)/REAL(NPROC,JPRD) + +CALL SORT(ZTSTEP1,ITERS) +ZTSTEPMED1 = ZTSTEP1(ITERS/2) + +ZTSTEPAVG2=(ZTSTEPAVG2/REAL(NPROC,JPRB))/REAL(ITERS,JPRD) +ZTSTEP2(:)=ZTSTEP2(:)/REAL(NPROC,JPRD) + +CALL SORT(ZTSTEP2,ITERS) +ZTSTEPMED2 = ZTSTEP2(ITERS/2) + +IF(MYPROC == 1)THEN + WRITE(NOUT,'(" ")') + WRITE(NOUT,'(A)') '===-=== START OF TIME STEP STATS ===-===' + WRITE(NOUT,'(" ")') + WRITE(NOUT,'("INVERSE TRANSFORMS")') + WRITE(NOUT,'("------------------")') + WRITE(NOUT,'("AVG (s): ",F8.4)') ZTSTEPAVG1 + WRITE(NOUT,'("MIN (s): ",F8.4)') ZTSTEPMIN1 + WRITE(NOUT,'("MAX (s): ",F8.4)') ZTSTEPMAX1 + WRITE(NOUT,'("MED (s): ",F8.4)') ZTSTEPMED1 + WRITE(NOUT,'(" ")') + WRITE(NOUT,'("DIRECT TRANSFORMS")') + WRITE(NOUT,'("-----------------")') + WRITE(NOUT,'("AVG (s): ",F8.4)') ZTSTEPAVG2 + WRITE(NOUT,'("MIN (s): ",F8.4)') ZTSTEPMIN2 + WRITE(NOUT,'("MAX (s): ",F8.4)') ZTSTEPMAX2 + WRITE(NOUT,'("MED (s): ",F8.4)') ZTSTEPMED2 + WRITE(NOUT,'(" ")') + WRITE(NOUT,'("INVERSE-DIRECT TRANSFORMS")') + WRITE(NOUT,'("-------------------------")') + WRITE(NOUT,'("AVG (s): ",F8.4)') ZTSTEPAVG + WRITE(NOUT,'("MIN (s): ",F8.4)') ZTSTEPMIN + WRITE(NOUT,'("MAX (s): ",F8.4)') ZTSTEPMAX + WRITE(NOUT,'("MED (s): ",F8.4)') ZTSTEPMED + WRITE(NOUT,'("LOOP (s): ",F8.4)') ZTLOOP + WRITE(NOUT,'(" ")') + WRITE(NOUT,'(A)') '===-=== END OF TIME STEP STATS ===-===' + WRITE(NOUT,'(" ")') +ENDIF + +IF( LSTACK ) THEN +! gather stack usage statistics + ISTACK = GETSTACKUSAGE() + IF(MYPROC == 1) THEN + PRINT 9000, istack + 9000 FORMAT("Stack Utilisation Information",/,& + &"=============================",//,& + &"Task Size(Bytes)",/,& + &"==== ===========",//,& + &" 1",11x,I10) + + DO I=2,NPROC + CALL MPL_RECV(ISTACK,KSOURCE=NPRCIDS(I),KTAG=I, & + & CDSTRING='TRANSFORM_TEST:') + PRINT '(I4,11X,I10)', I,ISTACK + ENDDO + ELSE + CALL MPL_SEND(ISTACK,KDEST=NPRCIDS(1),KTAG=MYPROC, & + & CDSTRING='TRANSFORM_TEST:') + ENDIF +ENDIF + + +!-------------------------- +IF( LSTATS ) THEN + CALL GSTATS(0,1) + CALL GSTATS_PRINT(NOUT,ZAVEAVE,JPMAXSTAT) +ENDIF +!-------------------------- + +! CLOSE FILE +IF( NPROC > 1 ) THEN + IF( MYPROC /= 1 ) THEN + CLOSE(UNIT=NOUT) + ENDIF +ENDIF + +DEALLOCATE(ZWINDS) +DEALLOCATE(ZGMV) +DEALLOCATE(ZGMVS) + +!-------------------------- +CALL MPL_BARRIER() +CALL MPL_END() +!-------------------------- + + + + +CONTAINS + +! ------------------------------------------------------------------ + +SUBROUTINE SETNDGL + +! Decide number of Gaussian latitudes given spectral truncation +! Only certain combinations of truncation/linear grid +! or quadratic grid are supported + +! +! See prepdata/programs/sptogp.F90 +! + +IF(NLIN == 0 .AND. (NQ == 1.OR.NQ == 2)) THEN + IF(NSMAX == 79) THEN + NDGL = 160 + ELSEIF(NSMAX == 95) THEN + NDGL = 192 + ELSEIF(NSMAX == 127) THEN + NDGL = 256 + ELSEIF(NSMAX == 159) THEN + NDGL = 320 + ELSEIF(NSMAX == 199) THEN + NDGL = 400 + ELSEIF(NSMAX == 255) THEN + NDGL = 512 + ELSEIF(NSMAX == 319) THEN + NDGL = 640 + ELSEIF(NSMAX == 399) THEN + NDGL = 800 + ELSEIF(NSMAX == 511) THEN + NDGL = 1024 + ELSEIF(NSMAX == 639) THEN + NDGL = 1280 + ELSEIF(NSMAX == 799) THEN + NDGL = 1600 + ELSEIF(NSMAX == 1023) THEN + NDGL = 2048 + ELSEIF(NSMAX == 1279) THEN + NDGL = 2560 + ELSEIF(NSMAX == 1599) THEN + NDGL = 3200 + ELSEIF(NSMAX == 1999) THEN + NDGL = 4000 + ELSEIF(NSMAX == 3999) THEN + NDGL = 8000 + ELSEIF(NSMAX == 7999) THEN + NDGL = 16000 + ELSE + WRITE(NERR,*)'WRONG SPECTRAL RESOLUTION ',NSMAX,' CUBIC GRID' + CALL ABOR1('TRANSFORM_TEST:UNSUPPORTED SPECTRAL RESOLUTION - CUBIC GRID ') + ENDIF +ELSEIF (NLIN == 0) THEN + IF(NSMAX == 21) THEN + NDGL = 32 + ELSEIF(NSMAX == 42) THEN + NDGL = 64 + ELSEIF(NSMAX == 63) THEN + NDGL = 96 + ELSEIF(NSMAX == 106) THEN + NDGL = 160 + ELSEIF(NSMAX == 213) THEN + NDGL = 320 + ELSEIF(NSMAX == 341) THEN + NDGL = 512 + ELSEIF(NSMAX == 426) THEN + NDGL = 640 + ELSEIF(NSMAX == 533) THEN + NDGL = 800 + ELSEIF(NSMAX == 682) THEN + NDGL = 1024 + ELSEIF(NSMAX == 853) THEN + NDGL = 1280 + ELSEIF(NSMAX == 1364) THEN + NDGL = 2048 + ELSEIF(NSMAX == 1706) THEN + NDGL = 2560 + ELSE + WRITE(NERR,*)'WRONG SPECTRAL RESOLUTION ',NSMAX,' QUAD. GRID' + CALL ABOR1('TRANSFORM_TEST:UNSUPPORTED SPECTRAL RESOLUTION - QUAD. GRID ') + ENDIF +ELSEIF(NLIN == 1) THEN + IF(NSMAX == 63) THEN + NDGL = 64 + ELSEIF(NSMAX == 95) THEN + NDGL = 96 + ELSEIF(NSMAX == 127) THEN + NDGL = 128 + ELSEIF(NSMAX == 159) THEN + NDGL = 160 + ELSEIF(NSMAX == 191) THEN + NDGL = 192 + ELSEIF(NSMAX == 199) THEN + NDGL = 200 + ELSEIF(NSMAX == 255) THEN + NDGL = 256 + ELSEIF(NSMAX == 319) THEN + NDGL = 320 + ELSEIF(NSMAX == 399) THEN + NDGL = 400 + ELSEIF(NSMAX == 511) THEN + NDGL = 512 + ELSEIF(NSMAX == 639) THEN + NDGL = 640 + ELSEIF(NSMAX == 799) THEN + NDGL = 800 + ELSEIF(NSMAX == 1023) THEN + NDGL = 1024 + ELSEIF(NSMAX == 1279) THEN + NDGL = 1280 + ELSEIF(NSMAX == 2047) THEN + NDGL = 2048 + ELSEIF(NSMAX == 3999) THEN + NDGL = 4000 + ELSEIF(NSMAX == 7999) THEN + NDGL = 8000 + ELSE + WRITE(NERR,*)'WRONG SPECTRAL RESOLUTION ',NSMAX,' LIN. GRID' + CALL ABOR1('TRANSFORM_TEST:UNSUPPORTED SPECTRAL RESOLUTION - LIN. GRID') + ENDIF +ELSE + WRITE(NERR,*)'WRONG NLIN=',NLIN + CALL ABOR1('TRANSFORM_TEST:WRONG NLIN') +ENDIF +END SUBROUTINE SETNDGL + +! ------------------------------------------------------------------ + +SUBROUTINE CHECK_NDGL + +! Decide number of Gaussian latitudes given spectral truncation +! Only certain combinations of truncation/linear grid +! or quadratic grid are supported + +IF(NLIN == 0) THEN + IF(NDGL .ne. 32 .and. NDGL .ne. 64 .and. NDGL .ne. 96 .and. & + NDGL .ne. 160 .and. NDGL .ne. 320 .and. NDGL .ne. 512 .and. & + NDGL .ne. 640 .and. NDGL .ne. 800 .and. NDGL .ne. 1024 .and. NDGL .ne. 1280 ) THEN + WRITE(NERR,*)'WRONG SPECTRAL RESOLUTION ',NSMAX,' QUAD. GRID' +! CALL ABOR1('TRANSFORM_TEST:UNSUPPORTED SPECTRAL RESOLUTION - QUAD. GRID ') + ENDIF +ELSEIF(NLIN == 1) THEN + IF(NDGL .ne. 32 .and. NDGL .ne. 64 .and. NDGL .ne. 96 .and. NDGL .ne. 128 .and. & + NDGL .ne. 160.and. NDGL .ne. 256 .and. NDGL .ne. 320 .and. NDGL .ne. 400 .and. & + NDGL .ne. 512.and. NDGL .ne. 640 .and. NDGL .ne. 800 .and. NDGL .ne. 1024) THEN + WRITE(NERR,*)'WRONG SPECTRAL RESOLUTION ',NSMAX,' LIN. GRID' +! CALL ABOR1('TRANSFORM_TEST:UNSUPPORTED SPECTRAL RESOLUTION - LIN. GRID') + ENDIF +ELSE + WRITE(NERR,*)'WRONG NLIN=',NLIN +! CALL ABOR1('TRANSFORM_TEST:WRONG NLIN') +ENDIF + +END SUBROUTINE CHECK_NDGL + +! ------------------------------------------------------------------ + +SUBROUTINE SORT(A, N) + IMPLICIT NONE + INTEGER(KIND=JPIM) :: N, I, J + REAL(KIND=JPRD) :: A(N), X + + DO I = 2, N + X = A(I) + J = I - 1 + DO WHILE (J >= 1) + IF (A(J) <= X) EXIT + A(J + 1) = A(J) + J = J - 1 + END DO + A(J + 1) = X + END DO +END SUBROUTINE + +! ------------------------------------------------------------------ + +SUBROUTINE DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) + +! Dump a 2D field to a binary file. + +INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file +INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:) ! 2D field +CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier +INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file + +CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" + +REAL(KIND=JPRB), ALLOCATABLE :: FLD_R(:,:) + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +ALLOCATE(FLD_R(SIZE(FLD,1), SIZE(FLD,2))) +OPEN(NOUTDUMP, FILE='./ref/' // FILENAME, FORM="UNFORMATTED") +READ(NOUTDUMP) FLD_R +CLOSE(NOUTDUMP) + +WRITE(400+MYPROC, *), FILENAME, MAXVAL(ABS(FLD_R-FLD)) + +OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +WRITE(NOUTDUMP) FLD +CLOSE(NOUTDUMP) + +END SUBROUTINE DUMP_GRIDPOINT_FIELD_2D +SUBROUTINE DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) + +! Dump a 3D field to a binary file. + +INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file +INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:,:) ! 3D field +CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier +INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file + +CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" +REAL(KIND=JPRB), ALLOCATABLE :: FLD_R(:,:,:) + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +ALLOCATE(FLD_R(SIZE(FLD,1), SIZE(FLD,2), SIZE(FLD,3))) +OPEN(NOUTDUMP, FILE='./ref/' // FILENAME, FORM="UNFORMATTED") +READ(NOUTDUMP) FLD_R +CLOSE(NOUTDUMP) + +WRITE(400+MYPROC, *), FILENAME, MAXVAL(ABS(FLD_R-FLD)) + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +WRITE(NOUTDUMP) FLD +CLOSE(NOUTDUMP) + +END SUBROUTINE DUMP_GRIDPOINT_FIELD_3D +SUBROUTINE DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) + +! Dump a 4D field to a binary file. + +INTEGER(KIND=JPIM), INTENT(IN) :: JSTEP ! Time step, used for naming file +INTEGER(KIND=JPIM), INTENT(IN) :: MYPROC ! MPI rank, used for naming file +REAL(KIND=JPRB) , INTENT(IN) :: FLD(:,:,:,:) ! 4D field +CHARACTER , INTENT(IN) :: FLDCHAR ! Single character field identifier +INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file + +CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" + +REAL(KIND=JPRB), ALLOCATABLE :: FLD_R(:,:,:,:) + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +ALLOCATE(FLD_R(SIZE(FLD,1), SIZE(FLD,2),SIZE(FLD,3),SIZE(FLD,4))) +OPEN(NOUTDUMP, FILE='./ref/' // FILENAME, FORM="UNFORMATTED") +READ(NOUTDUMP) FLD_R +CLOSE(NOUTDUMP) + +WRITE(400+MYPROC, *), FILENAME, MAXVAL(ABS(FLD_R-FLD)) + +WRITE(FILENAME(1:1),'(A1)') FLDCHAR +WRITE(FILENAME(3:5),'(I3.3)') JSTEP +WRITE(FILENAME(7:10),'(I4.4)') MYPROC + +OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +WRITE(NOUTDUMP) FLD +CLOSE(NOUTDUMP) + +END SUBROUTINE DUMP_GRIDPOINT_FIELD_4D + +END PROGRAM TRANSFORM_TEST From 5b0af2a15fc8490dc8ec32416ec785ea24821ac6 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 2 Jun 2022 05:32:10 -0700 Subject: [PATCH 247/263] Add a call to gpnorm --- src/programs/driver-spectraltransform.F90 | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 1000fd2f3..78de24b6e 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -77,6 +77,10 @@ PROGRAM TRANSFORM_TEST REAL(KIND=JPRB), POINTER :: ZT(:,:,:) => NULL() REAL(KIND=JPRB), ALLOCATABLE :: ZSP(:,:) +REAL(KIND=JPRB),ALLOCATABLE :: PAVE(:) +REAL(KIND=JPRB),ALLOCATABLE :: PMIN(:) +REAL(KIND=JPRB),ALLOCATABLE :: PMAX(:) + LOGICAL :: LSTACK LOGICAL :: LDONE,LSTDEV LOGICAL :: LUSERPNM, LKEEPRPNM, LUSEFLT @@ -149,6 +153,7 @@ PROGRAM TRANSFORM_TEST #include "setup_trans.h" #include "inv_trans.h" #include "dir_trans.h" +#include "gpnorm_trans.h" #include "dist_spec.h" #include "gath_grid.h" #include "trans_inq.h" @@ -776,6 +781,10 @@ PROGRAM TRANSFORM_TEST ALLOCATE(ZGMV(NPROMA,NFLEVG,NDIMGMV,NGPBLKS)) ALLOCATE(ZGMVS(NPROMA,NDIMGMVS,NGPBLKS)) +ALLOCATE(PMIN(NFLEVG)) +ALLOCATE(PMAX(NFLEVG)) +ALLOCATE(PAVE(NFLEVG)) + ALLOCATE(ZNORMSP(1)) ALLOCATE(ZNORMSP1(1)) ALLOCATE(ZNORMVOR(NFLEVG)) @@ -945,6 +954,18 @@ PROGRAM TRANSFORM_TEST ! !call sleep (10000) ENDDO +CALL GPNORM_TRANS(ZWINDS(:,:,2,:),NFLEVG,KPROMA=NPROMA,PAVE=PAVE,PMIN=PMIN,PMAX=PMAX,LDAVE_ONLY=.false.,KRESOL=1) +if (myproc == 1) then + OPEN(800+myproc, FORM="UNFORMATTED") + write(800+myproc) "pave", sum(pave)/size(pave) + write(800+myproc) "pmin", sum(pmin)/size(pmin) + write(800+myproc) "pmax", sum(pmax)/size(pmax) + close(800+myproc) + print *, "pave", sum(pave)/size(pave) + print *, "pmin", sum(pmin)/size(pmin) + print *, "pmax", sum(pmax)/size(pmax) +endif + ZTLOOP=(TIMEF()-ZTLOOP)/1000.0_JPRD WRITE(NOUT,'(" ")') From e137897bf30baf1bf926631f5836717abd62ec35 Mon Sep 17 00:00:00 2001 From: Lukas Mosiman Date: Wed, 27 Jul 2022 01:19:59 -0700 Subject: [PATCH 248/263] Make dump optional --- src/programs/driver-spectraltransform.F90 | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 78de24b6e..0d38f9a4f 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -87,7 +87,7 @@ PROGRAM TRANSFORM_TEST LOGICAL :: LTRACE_STATS,LSTATS_OMP, LSTATS_COMMS, LSTATS_MPL LOGICAL :: LSTATS,LBARRIER_STATS, LBARRIER_STATS2, LDETAILED_STATS LOGICAL :: LSTATS_ALLOC, LSYNCSTATS, LSTATSCPU, LSTATS_MEM -LOGICAL :: LXML_STATS +LOGICAL :: LXML_STATS, LDUMP LOGICAL :: LFFTW INTEGER(KIND=JPIM) :: NSTATS_MEM, NTRACE_STATS, NPRNT_STATS ! 0 - no output, 1 - init and final result, 2 - every timestep @@ -145,7 +145,7 @@ PROGRAM TRANSFORM_TEST & LUSERPNM, LKEEPRPNM, LUSEFLT, NQ, NLIN, IMAX_FLDS_IN, & & NPRINTNORMS, ITERS, ZMAXERR_CHECK, NPROMA, NPROMATR, LEQ_REGIONS, & & NPRINTLEV, NPRTRW, NPRTRV, NSPECRESMIN, NFLEVG, MBX_SIZE, LSTACK, & - & LFFTW + & LFFTW, LDUMP ! ------------------------------------------------------------------ @@ -206,6 +206,7 @@ PROGRAM TRANSFORM_TEST LBARRIER_STATS2=.FALSE. LSTATSCPU=.FALSE. LSYNCSTATS=.FALSE. +LDUMP=.TRUE. LXML_STATS=.FALSE. LTRACE_STATS=.FALSE. NSTATS_MEM=0 @@ -871,9 +872,9 @@ PROGRAM TRANSFORM_TEST ZTSTEP1(JSTEP)=(TIMEF()-ZTSTEP1(JSTEP))/1000.0_JPRD ! Dump a field to a binary file - CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZGMVS(:,:,:), 'S', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZWINDS(:,:,:,:), 'W', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) ZTSTEP2(JSTEP)=TIMEF() CALL DIR_TRANS(PSPVOR=ZVOR,PSPDIV=ZDIV,& @@ -885,10 +886,10 @@ PROGRAM TRANSFORM_TEST ZTSTEP2(JSTEP)=(TIMEF()-ZTSTEP2(JSTEP))/1000.0_JPRD ! Dump a field to a binary file - CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) - CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZVOR(:,:), 'V', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZDIV(:,:), 'D', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, ZSP(:,:), 'P', NOUTDUMP) + IF (LDUMP) CALL DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) ZTSTEP(JSTEP)=(TIMEF()-ZTSTEP(JSTEP))/1000.0_JPRD From 974069db2df4e43a00ed7b3e5c44c424e735b55b Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 28 Jul 2022 23:11:39 -0700 Subject: [PATCH 249/263] add dump directory as env variable --- src/programs/driver-spectraltransform.F90 | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/programs/driver-spectraltransform.F90 b/src/programs/driver-spectraltransform.F90 index 0d38f9a4f..4b8fb29c3 100644 --- a/src/programs/driver-spectraltransform.F90 +++ b/src/programs/driver-spectraltransform.F90 @@ -1321,12 +1321,15 @@ SUBROUTINE DUMP_GRIDPOINT_FIELD_2D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" +CHARACTER(LEN=60) :: DUMP_DIR WRITE(FILENAME(1:1),'(A1)') FLDCHAR WRITE(FILENAME(3:5),'(I3.3)') JSTEP WRITE(FILENAME(7:10),'(I4.4)') MYPROC -OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +CALL GETENV("DUMP_DIR", DUMP_DIR) +IF (TRIM(DUMP_DIR) == "") CALL GETCWD(DUMP_DIR) +OPEN(NOUTDUMP, FILE=TRIM(DUMP_DIR)//'/'//FILENAME, FORM="UNFORMATTED") WRITE(NOUTDUMP) FLD CLOSE(NOUTDUMP) @@ -1342,12 +1345,15 @@ SUBROUTINE DUMP_GRIDPOINT_FIELD_3D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" +CHARACTER(LEN=60) :: DUMP_DIR WRITE(FILENAME(1:1),'(A1)') FLDCHAR WRITE(FILENAME(3:5),'(I3.3)') JSTEP WRITE(FILENAME(7:10),'(I4.4)') MYPROC -OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +CALL GETENV("DUMP_DIR", DUMP_DIR) +IF (TRIM(DUMP_DIR) == "") CALL GETCWD(DUMP_DIR) +OPEN(NOUTDUMP, FILE=TRIM(DUMP_DIR)//'/'//FILENAME, FORM="UNFORMATTED") WRITE(NOUTDUMP) FLD CLOSE(NOUTDUMP) @@ -1363,12 +1369,15 @@ SUBROUTINE DUMP_GRIDPOINT_FIELD_4D(JSTEP, MYPROC, FLD, FLDCHAR, NOUTDUMP) INTEGER(KIND=JPIM), INTENT(IN) :: NOUTDUMP ! Unit number for output file CHARACTER(LEN=14) :: FILENAME = "X.XXX.XXXX.dat" +CHARACTER(LEN=60) :: DUMP_DIR WRITE(FILENAME(1:1),'(A1)') FLDCHAR WRITE(FILENAME(3:5),'(I3.3)') JSTEP WRITE(FILENAME(7:10),'(I4.4)') MYPROC -OPEN(NOUTDUMP, FILE=FILENAME, FORM="UNFORMATTED") +CALL GETENV("DUMP_DIR", DUMP_DIR) +IF (TRIM(DUMP_DIR) == "") CALL GETCWD(DUMP_DIR) +OPEN(NOUTDUMP, FILE=TRIM(DUMP_DIR)//'/'//FILENAME, FORM="UNFORMATTED") WRITE(NOUTDUMP) FLD CLOSE(NOUTDUMP) From eb7e29ba9e992e143f128de858a7e244b281d20a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 29 Jul 2022 02:59:53 -0700 Subject: [PATCH 250/263] fix size of zinp/zout in ledir --- src/trans/gpu/external/setup_trans.F90 | 5 +- src/trans/gpu/internal/ledir_mod.F90 | 82 ++++++++++++++----- src/trans/gpu/internal/ltdir_mod.F90 | 24 +++--- src/trans/gpu/internal/tpm_trans.F90 | 5 ++ src/trans/gpu/internal/trltom_pack_unpack.F90 | 81 +++++++++--------- 5 files changed, 122 insertions(+), 75 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 34eb818d9..91e54d445 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -118,7 +118,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif USE TPM_FLT -USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN +USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN, LEDIR_CONF +USE LEDIR_MOD, ONLY: SETUP_LEDIR USE TPM_CTL USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -627,4 +628,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ENDIF +CALL SETUP_LEDIR(LEDIR_CONF) + END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index d834c691c..ca08c7f89 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -11,43 +11,82 @@ MODULE LEDIR_MOD USE PARKIND_ECTRANS ,ONLY : JPIM + USE TPM_TRANS, ONLY: LEDIR_CONFIG IMPLICIT NONE PRIVATE PUBLIC :: LEDIR_STRIDES, LEDIR + PUBLIC :: SETUP_LEDIR INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS - SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + SUBROUTINE SETUP_LEDIR(CONFIG) + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE PARKIND_ECTRANS, ONLY: JPIM + + IMPLICIT NONE + + TYPE(LEDIR_CONFIG), INTENT(OUT) :: CONFIG + + INTEGER(KIND=JPIM) :: KM, KMLOC, N_OFFSET, K_OFFSET + + ALLOCATE(CONFIG%OFFSETS_N(D%NUMP+1)) + ALLOCATE(CONFIG%OFFSETS_K(D%NUMP+1)) + + N_OFFSET = 0 + K_OFFSET = 0 + DO KMLOC=1,D%NUMP + KM = D%MYMS(KMLOC) + CONFIG%OFFSETS_K(KMLOC) = K_OFFSET + CONFIG%OFFSETS_N(KMLOC) = N_OFFSET + + !KM=0 is transformed in double precision, no need to store here + IF (KM /= 0) THEN + K_OFFSET = K_OFFSET + ALIGN(G%NDGLU(KM),A) + ! N_OFFSET takes the max of the two GEMMs + N_OFFSET = N_OFFSET + ALIGN((R%NSMAX-KM+3)/2,A) + ENDIF + ENDDO + CONFIG%OFFSETS_K(D%NUMP+1) = K_OFFSET + CONFIG%OFFSETS_N(D%NUMP+1) = N_OFFSET + + !$ACC ENTER DATA COPYIN(CONFIG,CONFIG%OFFSETS_K,CONFIG%OFFSETS_N) + END SUBROUTINE + + SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R + USE TPM_DISTR, ONLY: D + USE TPM_TRANS, ONLY: LEDIR_CONF IMPLICIT NONE INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE IF (PRESENT(IOUT_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_FS,A) - IF (PRESENT(IOUT_STRIDES1)) & - IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IOUT_SIZE)) & + IOUT_SIZE = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(D%NUMP+1) IF (PRESENT(IIN_STRIDES0)) & IIN_STRIDES0 = ALIGN(2*KF_FS,A) - IF (PRESENT(IIN_STRIDES1)) & - IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IIN_SIZE)) & + IIN_SIZE = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(D%NUMP+1) IF (PRESENT(IOUT0_STRIDES0)) & IOUT0_STRIDES0 = ALIGN(KF_FS,A) - IF (PRESENT(IOUT0_STRIDES1)) & - IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IOUT0_SIZE)) & + IOUT0_SIZE = IOUT0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) IF (PRESENT(IIN0_STRIDES0)) & IIN0_STRIDES0 = ALIGN(KF_FS,A) - IF (PRESENT(IIN0_STRIDES1)) & - IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IIN0_SIZE)) & + IIN0_SIZE = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) END SUBROUTINE SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) @@ -105,6 +144,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC USE OPENACC + USE TPM_TRANS, ONLY: LEDIR_CONF IMPLICIT NONE @@ -143,7 +183,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & - !$ACC& PRESENT(ZAA,ZAS,POA1) + !$ACC& PRESENT(ZAA,ZAS,POA1,LEDIR_CONF) ! anti-symmetric IF(KMLOC0 > 0) THEN @@ -177,9 +217,9 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+2)/2 KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + AOFFSETS(KMLOC) = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -211,7 +251,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R%NSMAX-KM+2)/2 - POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+LEDIR_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) ENDDO ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ @@ -252,9 +292,9 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+3)/2 KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + AOFFSETS(KMLOC) = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -286,7 +326,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+LEDIR_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) ENDDO ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index c941103ce..1990a154e 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -36,23 +36,23 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) TYPE(LTDIR_HANDLE) :: HLTDIR INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& - IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) ! POA1 IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) ! POA2 IALLOC_SZ = IALLOC_SZ + ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) ! ZOUT - IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) ! ZOUT0 - IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) END FUNCTION @@ -166,8 +166,8 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE @@ -184,8 +184,8 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA !* 2. PREPARE WORK ARRAYS. ! -------------------- - CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& - IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) IALLOC_POS = 1 @@ -202,13 +202,13 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUT(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUT(1)),128) CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUT0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUT0(1)),128) CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index ac6081272..256832cee 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -72,4 +72,9 @@ MODULE TPM_TRANS ! is going to recreate the graphs if needed) INTEGER(KIND=C_INT8_T),POINTER :: REUSE_PTR(:) +TYPE LEDIR_CONFIG + INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_K(:), OFFSETS_N(:) +END TYPE +TYPE(LEDIR_CONFIG) :: LEDIR_CONF + END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index 04acf8936..cf46ca56c 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -129,21 +129,21 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=C_SIZE_T) :: ISIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) ! Check if the reuse buffer is large enough - ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) END FUNCTION @@ -155,6 +155,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP USE TPM_FIELDS, ONLY : F USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 USE LEDIR_MOD, ONLY : LEDIR_STRIDES + USE TPM_TRANS, ONLY: LEDIR_CONF USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE @@ -168,8 +169,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ @@ -178,27 +179,27 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT) :: PAIA, PAIS - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& + IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) IALLOC_POS=1 - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPS(0)),128) CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPA(0)),128) CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPS0(0)),128) CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPA0(0)),128) CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ @@ -209,7 +210,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(D_NPNTGTB1) - !$ACC DATA PRESENT(FOUBUF) + !$ACC DATA PRESENT(FOUBUF,LEDIR_CONF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH @@ -229,8 +230,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP PAIS = PAIS*F%RACTHE(JGL) ENDIF IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIS*F%RW(JGL) ELSEIF (MOD(JF-1,2) == 0) THEN ! every other field is sufficient because Im(KM=0) == 0 ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) @@ -256,21 +257,21 @@ FUNCTION PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) RESULT(HTRLTOM_DIRECT) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=C_SIZE_T) :: ISIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE, & + & IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) ! Check if the reuse buffer is large enough - ISIZE = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) HTRLTOM_DIRECT%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) END FUNCTION @@ -282,6 +283,7 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP USE TPM_FIELDS ,ONLY : F USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYSETW,D_NSTAGTF,D_NPTRLS USE LEDIR_MOD, ONLY: LEDIR_STRIDES + USE TPM_TRANS, ONLY: LEDIR_CONF USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE @@ -293,8 +295,8 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRLTOM_DIRECT_HANDLE), INTENT(IN) :: HTRLTOM_DIRECT - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ @@ -303,39 +305,36 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_STRIDES1=IIN0_STRIDES1) + CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE, & + & IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) IALLOC_POS=1 - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPS(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPS(0)),128) CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINPA(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPA(0)),128) CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPS0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPS0(0)),128) CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINPA0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPA0(0)),128) CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_STRIDES1=IIN_STRIDES1,& - IIN0_STRIDES0=IIN0_STRIDES0) - !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,G,G_NDGLU) & - !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN) + !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN,LEDIR_CONF) OFFSET_VAR=D_NPTRLS(MYSETW) @@ -368,8 +367,8 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP PAIS = PAIS*F%RACTHE(JGL) ENDIF IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PAIS*F%RW(JGL) + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIS*F%RW(JGL) ELSEIF (MOD(JF-1,2) == 0) THEN ! every other field is sufficient because Im(KM=0) == 0 ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) From 187d17f218d2ed8f2ee068b328bd0cefed7fbcb1 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 29 Jul 2022 03:19:27 -0700 Subject: [PATCH 251/263] fix size of zinp/zout in leinv --- src/trans/gpu/external/setup_trans.F90 | 4 +- src/trans/gpu/internal/leinv_mod.F90 | 91 +++++++++++++------ src/trans/gpu/internal/ltinv_mod.F90 | 48 +++++----- src/trans/gpu/internal/tpm_trans.F90 | 5 + src/trans/gpu/internal/trmtol_pack_unpack.F90 | 15 +-- 5 files changed, 105 insertions(+), 58 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 91e54d445..e97a10f97 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -118,8 +118,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif USE TPM_FLT -USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN, LEDIR_CONF +USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN, LEDIR_CONF, LEINV_CONF USE LEDIR_MOD, ONLY: SETUP_LEDIR +USE LEINV_MOD, ONLY: SETUP_LEINV USE TPM_CTL USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -629,5 +630,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ENDIF CALL SETUP_LEDIR(LEDIR_CONF) +CALL SETUP_LEINV(LEINV_CONF) END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index c6cf5b903..221185052 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -11,45 +11,83 @@ MODULE LEINV_MOD USE PARKIND_ECTRANS ,ONLY : JPIM + USE TPM_TRANS, ONLY: LEINV_CONFIG IMPLICIT NONE PRIVATE PUBLIC :: LEINV_STRIDES, LEINV + PUBLIC :: SETUP_LEINV INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS - SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + SUBROUTINE SETUP_LEINV(CONFIG) + USE TPM_DISTR, ONLY: D + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE PARKIND_ECTRANS, ONLY: JPIM + + IMPLICIT NONE + + TYPE(LEINV_CONFIG), INTENT(OUT) :: CONFIG + + INTEGER(KIND=JPIM) :: KM, KMLOC, N_OFFSET, K_OFFSET + + ALLOCATE(CONFIG%OFFSETS_N(D%NUMP+1)) + ALLOCATE(CONFIG%OFFSETS_K(D%NUMP+1)) + + N_OFFSET = 0 + K_OFFSET = 0 + DO KMLOC=1,D%NUMP + KM = D%MYMS(KMLOC) + CONFIG%OFFSETS_K(KMLOC) = K_OFFSET + CONFIG%OFFSETS_N(KMLOC) = N_OFFSET + + !KM=0 is transformed in double precision, no need to store here + IF (KM /= 0) THEN + ! K_OFFSET takes the max of the two GEMMs + K_OFFSET = K_OFFSET + ALIGN((R%NSMAX-KM+3)/2,A) + N_OFFSET = N_OFFSET + ALIGN(G%NDGLU(KM),A) + ENDIF + ENDDO + CONFIG%OFFSETS_K(D%NUMP+1) = K_OFFSET + CONFIG%OFFSETS_N(D%NUMP+1) = N_OFFSET + + !$ACC ENTER DATA COPYIN(CONFIG,CONFIG%OFFSETS_K,CONFIG%OFFSETS_N) + END SUBROUTINE + SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R + USE TPM_DISTR, ONLY: D + USE TPM_TRANS, ONLY: LEINV_CONF IMPLICIT NONE INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE IF (PRESENT(IOUT_STRIDES0)) & IOUT0_STRIDES0 = ALIGN(KF_LEG,A) - IF (PRESENT(IOUT_STRIDES1)) & - IOUT0_STRIDES1 = IOUT0_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IOUT0_SIZE)) & + IOUT0_SIZE = IOUT0_STRIDES0 * ALIGN(R%NDGNH,A) IF (PRESENT(IIN_STRIDES0)) & IIN_STRIDES0 = ALIGN(2*KF_LEG,A) - IF (PRESENT(IIN_STRIDES1)) & - IIN_STRIDES1 = IIN_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN_SIZE)) & + IIN_SIZE = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(D%NUMP+1) IF (PRESENT(IOUT0_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_LEG,A) - IF (PRESENT(IOUT0_STRIDES1)) & - IOUT_STRIDES1 = IOUT_STRIDES0 * ALIGN(R%NDGNH,A) + IF (PRESENT(IOUT_SIZE)) & + IOUT_SIZE = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(D%NUMP+1) IF (PRESENT(IIN0_STRIDES0)) & IIN0_STRIDES0 = ALIGN(KF_LEG,A) - IF (PRESENT(IIN0_STRIDES1)) & - IIN0_STRIDES1 = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + IF (PRESENT(IIN0_SIZE)) & + IIN0_SIZE = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) END SUBROUTINE SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) @@ -101,6 +139,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX + USE TPM_TRANS, ONLY: LEINV_CONF IMPLICIT NONE @@ -112,10 +151,10 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! LOCAL INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -130,8 +169,8 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !* 1.1 PREPARATIONS. - CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & @@ -162,7 +201,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+LEINV_CONF%OFFSETS_K(KMLOC)*IIN_STRIDES0)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ! every other field is sufficient because Im(KM=0) == 0 @@ -199,9 +238,9 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+2)/2 NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + AOFFSETS(KMLOC) = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -244,7 +283,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF(KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+(KMLOC-1)*IIN_STRIDES1)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+LEINV_CONF%OFFSETS_K(KMLOC)*IIN_STRIDES0)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) == 0) THEN !$ACC LOOP SEQ @@ -279,9 +318,9 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+3)/2 NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES1*(KMLOC-1) + AOFFSETS(KMLOC) = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES1*(KMLOC-1) + COFFSETS(KMLOC) = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 8861796a4..1c1c69466 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -40,10 +40,10 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( TYPE(LTINV_HANDLE) :: HLTINV INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ, IPIA_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY @@ -70,27 +70,27 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed - CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) ! PIA IALLOC_SZ = IPIA_SZ ! ZINP - IALLOC_SZ = IALLOC_SZ + ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) ! ZINP0 - IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ) IALLOC_SZ = 0 ! ZOUTA - IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) ! ZOUTS - IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) ! ZOUTA0 - IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) ! ZOUTS0 - IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_STRIDES1*SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) END FUNCTION @@ -192,10 +192,10 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ @@ -226,8 +226,8 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed - CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& - IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) + CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& + IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) IALLOC_POS = 1 @@ -239,13 +239,13 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZINP - IALLOC_SZ = ALIGN(IIN_STRIDES1*D%NUMP*SIZEOF(ZINP(1)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINP(1)),128) CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZINP0 - IALLOC_SZ = ALIGN(IIN0_STRIDES1*SIZEOF(ZINP0(1)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINP0(1)),128) CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ @@ -253,25 +253,25 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IALLOC_POS = 1 ! ZOUTA - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTA(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUTA(1)),128) CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTS - IALLOC_SZ = ALIGN(IOUT_STRIDES1*D%NUMP*SIZEOF(ZOUTS(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUTS(1)),128) CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTA0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTA0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUTA0(1)),128) CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTS0 - IALLOC_SZ = ALIGN(IOUT0_STRIDES1*SIZEOF(ZOUTS0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUTS0(1)),128) CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 256832cee..a50d3a9d5 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -77,4 +77,9 @@ MODULE TPM_TRANS END TYPE TYPE(LEDIR_CONFIG) :: LEDIR_CONF +TYPE LEINV_CONFIG + INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_K(:), OFFSETS_N(:) +END TYPE +TYPE(LEINV_CONFIG) :: LEINV_CONF + END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 4e741a040..a71b8ab16 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -91,6 +91,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE TPM_GEOMETRY,ONLY : G,G_NDGLU USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1 USE LEINV_MOD, ONLY: LEINV_STRIDES + USE TPM_TRANS, ONLY: LEINV_CONF IMPLICIT NONE @@ -107,8 +108,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I REAL(KIND=JPRBT) :: ZAOA, ZSOA INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -117,11 +118,11 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& & 1_C_SIZE_T, D%NLENGT1B*2*KF_LEG*SIZEOF(FOUBUF_IN(1))) - CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_STRIDES1=IOUT_STRIDES1,& - IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_STRIDES1=IOUT0_STRIDES1) + CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& + IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,G,G_NDGLU) & - !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) + !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,LEINV_CONF) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) ASYNC(1) DO KMLOC=1,D_NUMP @@ -136,8 +137,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) - ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+(KMLOC-1)*IOUT_STRIDES1) + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+LEINV_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+LEINV_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) From 14b7f039e8d37139d751ceee880d2a0e897588c5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 29 Jul 2022 05:12:00 -0700 Subject: [PATCH 252/263] use same strategy as for other offset arrays --- src/trans/gpu/external/setup_trans.F90 | 7 +-- src/trans/gpu/internal/ledir_mod.F90 | 60 ++++--------------- src/trans/gpu/internal/leinv_mod.F90 | 60 ++++--------------- src/trans/gpu/internal/sump_trans_mod.F90 | 28 ++++++++- src/trans/gpu/internal/tpm_distr.F90 | 5 ++ src/trans/gpu/internal/tpm_trans.F90 | 10 ---- src/trans/gpu/internal/trltom_pack_unpack.F90 | 18 +++--- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 9 ++- 8 files changed, 66 insertions(+), 131 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index e97a10f97..34eb818d9 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -118,9 +118,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif USE TPM_FLT -USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN, LEDIR_CONF, LEINV_CONF -USE LEDIR_MOD, ONLY: SETUP_LEDIR -USE LEINV_MOD, ONLY: SETUP_LEINV +USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN USE TPM_CTL USE SET_RESOL_MOD ,ONLY : SET_RESOL @@ -629,7 +627,4 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ENDIF -CALL SETUP_LEDIR(LEDIR_CONF) -CALL SETUP_LEINV(LEINV_CONF) - END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index ca08c7f89..4f7954e6d 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -16,51 +16,14 @@ MODULE LEDIR_MOD PRIVATE PUBLIC :: LEDIR_STRIDES, LEDIR - PUBLIC :: SETUP_LEDIR INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS - SUBROUTINE SETUP_LEDIR(CONFIG) - USE TPM_DISTR, ONLY: D - USE TPM_DIM, ONLY: R - USE TPM_GEOMETRY, ONLY: G - USE PARKIND_ECTRANS, ONLY: JPIM - - IMPLICIT NONE - - TYPE(LEDIR_CONFIG), INTENT(OUT) :: CONFIG - - INTEGER(KIND=JPIM) :: KM, KMLOC, N_OFFSET, K_OFFSET - - ALLOCATE(CONFIG%OFFSETS_N(D%NUMP+1)) - ALLOCATE(CONFIG%OFFSETS_K(D%NUMP+1)) - - N_OFFSET = 0 - K_OFFSET = 0 - DO KMLOC=1,D%NUMP - KM = D%MYMS(KMLOC) - CONFIG%OFFSETS_K(KMLOC) = K_OFFSET - CONFIG%OFFSETS_N(KMLOC) = N_OFFSET - - !KM=0 is transformed in double precision, no need to store here - IF (KM /= 0) THEN - K_OFFSET = K_OFFSET + ALIGN(G%NDGLU(KM),A) - ! N_OFFSET takes the max of the two GEMMs - N_OFFSET = N_OFFSET + ALIGN((R%NSMAX-KM+3)/2,A) - ENDIF - ENDDO - CONFIG%OFFSETS_K(D%NUMP+1) = K_OFFSET - CONFIG%OFFSETS_N(D%NUMP+1) = N_OFFSET - - !$ACC ENTER DATA COPYIN(CONFIG,CONFIG%OFFSETS_K,CONFIG%OFFSETS_N) - END SUBROUTINE - SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R - USE TPM_DISTR, ONLY: D - USE TPM_TRANS, ONLY: LEDIR_CONF + USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 IMPLICIT NONE @@ -74,11 +37,11 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IF (PRESENT(IOUT_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_FS,A) IF (PRESENT(IOUT_SIZE)) & - IOUT_SIZE = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(D%NUMP+1) + IOUT_SIZE = IOUT_STRIDES0*D_OFFSETS_GEMM2(D%NUMP+1) IF (PRESENT(IIN_STRIDES0)) & IIN_STRIDES0 = ALIGN(2*KF_FS,A) IF (PRESENT(IIN_SIZE)) & - IIN_SIZE = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(D%NUMP+1) + IIN_SIZE = IIN_STRIDES0*D_OFFSETS_GEMM1(D%NUMP+1) IF (PRESENT(IOUT0_STRIDES0)) & IOUT0_STRIDES0 = ALIGN(KF_FS,A) IF (PRESENT(IOUT0_SIZE)) & @@ -137,14 +100,13 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : F,ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC USE OPENACC - USE TPM_TRANS, ONLY: LEDIR_CONF IMPLICIT NONE @@ -183,7 +145,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & - !$ACC& PRESENT(ZAA,ZAS,POA1,LEDIR_CONF) + !$ACC& PRESENT(ZAA,ZAS,POA1,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) ! anti-symmetric IF(KMLOC0 > 0) THEN @@ -217,9 +179,9 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+2)/2 KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(KMLOC) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -251,7 +213,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R%NSMAX-KM+2)/2 - POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+LEDIR_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) + POA1(JF,IA+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IOUT_STRIDES0) ENDDO ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ @@ -292,9 +254,9 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) KM = D_MYMS(KMLOC) NS(KMLOC) = (R%NSMAX-KM+3)/2 KS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES0*LEDIR_CONF%OFFSETS_N(KMLOC) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -326,7 +288,7 @@ SUBROUTINE LEDIR(ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+LEDIR_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) + POA1(JF,IS+1+(J-1)*2,KMLOC) = ZOUT(JF+(J-1)*IOUT_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IOUT_STRIDES0) ENDDO ELSEIF (MOD(JF-1,2) == 0) THEN !$ACC LOOP SEQ diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 221185052..e893732b2 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -11,56 +11,19 @@ MODULE LEINV_MOD USE PARKIND_ECTRANS ,ONLY : JPIM - USE TPM_TRANS, ONLY: LEINV_CONFIG IMPLICIT NONE PRIVATE PUBLIC :: LEINV_STRIDES, LEINV - PUBLIC :: SETUP_LEINV INTEGER(KIND=JPIM) :: A = 8 !Alignment CONTAINS - SUBROUTINE SETUP_LEINV(CONFIG) - USE TPM_DISTR, ONLY: D - USE TPM_DIM, ONLY: R - USE TPM_GEOMETRY, ONLY: G - USE PARKIND_ECTRANS, ONLY: JPIM - - IMPLICIT NONE - - TYPE(LEINV_CONFIG), INTENT(OUT) :: CONFIG - - INTEGER(KIND=JPIM) :: KM, KMLOC, N_OFFSET, K_OFFSET - - ALLOCATE(CONFIG%OFFSETS_N(D%NUMP+1)) - ALLOCATE(CONFIG%OFFSETS_K(D%NUMP+1)) - - N_OFFSET = 0 - K_OFFSET = 0 - DO KMLOC=1,D%NUMP - KM = D%MYMS(KMLOC) - CONFIG%OFFSETS_K(KMLOC) = K_OFFSET - CONFIG%OFFSETS_N(KMLOC) = N_OFFSET - - !KM=0 is transformed in double precision, no need to store here - IF (KM /= 0) THEN - ! K_OFFSET takes the max of the two GEMMs - K_OFFSET = K_OFFSET + ALIGN((R%NSMAX-KM+3)/2,A) - N_OFFSET = N_OFFSET + ALIGN(G%NDGLU(KM),A) - ENDIF - ENDDO - CONFIG%OFFSETS_K(D%NUMP+1) = K_OFFSET - CONFIG%OFFSETS_N(D%NUMP+1) = N_OFFSET - - !$ACC ENTER DATA COPYIN(CONFIG,CONFIG%OFFSETS_K,CONFIG%OFFSETS_N) - END SUBROUTINE SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD USE TPM_DIM ,ONLY : R - USE TPM_DISTR, ONLY: D - USE TPM_TRANS, ONLY: LEINV_CONF + USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 IMPLICIT NONE @@ -79,11 +42,11 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IF (PRESENT(IIN_STRIDES0)) & IIN_STRIDES0 = ALIGN(2*KF_LEG,A) IF (PRESENT(IIN_SIZE)) & - IIN_SIZE = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(D%NUMP+1) + IIN_SIZE = IIN_STRIDES0*D_OFFSETS_GEMM2(D%NUMP+1) IF (PRESENT(IOUT0_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_LEG,A) IF (PRESENT(IOUT_SIZE)) & - IOUT_SIZE = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(D%NUMP+1) + IOUT_SIZE = IOUT_STRIDES0*D_OFFSETS_GEMM1(D%NUMP+1) IF (PRESENT(IIN0_STRIDES0)) & IIN0_STRIDES0 = ALIGN(KF_LEG,A) IF (PRESENT(IIN0_SIZE)) & @@ -135,11 +98,10 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYPROC + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 USE CUDA_GEMM_BATCHED_MOD USE MPL_MODULE ,ONLY : MPL_BARRIER USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE TPM_TRANS, ONLY: LEINV_CONF IMPLICIT NONE @@ -176,7 +138,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC DATA PRESENT(D,D_MYMS,G,G_NDGLU,R) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & - !$ACC& PRESENT(D_MYMS,G_NDGLU) + !$ACC& PRESENT(D_MYMS,G_NDGLU,D_OFFSETS_GEMM2) IF (KMLOC0 > 0) THEN print*,'computing m=0 in double precision' @@ -201,7 +163,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF(KM /= 0)THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+2)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+LEINV_CONF%OFFSETS_K(KMLOC)*IIN_STRIDES0)=PIA(JK,IA+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ! every other field is sufficient because Im(KM=0) == 0 @@ -238,9 +200,9 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+2)/2 NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(KMLOC) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(KMLOC) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 @@ -283,7 +245,7 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF(KM /= 0) THEN !$ACC LOOP SEQ DO J=1,(R_NSMAX-KM+3)/2 - ZINP(JK+(J-1)*IIN_STRIDES0+LEINV_CONF%OFFSETS_K(KMLOC)*IIN_STRIDES0)=PIA(JK,IS+1+(J-1)*2,KMLOC) + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO ELSEIF (MOD((JK-1),2) == 0) THEN !$ACC LOOP SEQ @@ -318,9 +280,9 @@ SUBROUTINE LEINV(PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) KS(KMLOC) = (R%NSMAX-KM+3)/2 NS(KMLOC) = G%NDGLU(KM) - AOFFSETS(KMLOC) = IIN_STRIDES0*LEINV_CONF%OFFSETS_K(KMLOC) + AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) - COFFSETS(KMLOC) = IOUT_STRIDES0*LEINV_CONF%OFFSETS_N(KMLOC) + COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN NS(KMLOC0) = 0 diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index d750a9367..60d5821e3 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -1,3 +1,4 @@ +#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2022- NVIDIA. ! @@ -22,7 +23,7 @@ SUBROUTINE SUMP_TRANS USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R USE TPM_GEOMETRY ,ONLY : G -USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC +USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 !USE SUWAVEDI_MOD !USE PE2SET_MOD @@ -39,7 +40,7 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM -INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF +INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZDUM(:) @@ -268,6 +269,29 @@ SUBROUTINE SUMP_TRANS IF(ALLOCATED(ZDUM)) DEALLOCATE(ZDUM) DEALLOCATE(IGPTOTL) +ALLOCATE(D_OFFSETS_GEMM1(D%NUMP+1)) +ALLOCATE(D_OFFSETS_GEMM2(D%NUMP+1)) + +OFFSET1 = 0 +OFFSET2 = 0 +DO KMLOC=1,D%NUMP + KM = D%MYMS(KMLOC) + D_OFFSETS_GEMM1(KMLOC) = OFFSET1 + D_OFFSETS_GEMM2(KMLOC) = OFFSET2 + + !KM=0 is transformed in double precision, no need to store here + IF (KM /= 0) THEN + OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) + ! N_OFFSET takes the max of the two GEMMs + OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + ENDIF +ENDDO +D_OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 +D_OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 + +!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) + + ! ------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 index 8901edf7c..6a0bf0fdc 100755 --- a/src/trans/gpu/internal/tpm_distr.F90 +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -179,6 +179,11 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S) +! The offsets in the input and output arrays to the gemms. +! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) +! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) +INTEGER(KIND=JPIM), ALLOCATABLE :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) + TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) TYPE(DISTR_TYPE),POINTER :: D diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index a50d3a9d5..ac6081272 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -72,14 +72,4 @@ MODULE TPM_TRANS ! is going to recreate the graphs if needed) INTEGER(KIND=C_INT8_T),POINTER :: REUSE_PTR(:) -TYPE LEDIR_CONFIG - INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_K(:), OFFSETS_N(:) -END TYPE -TYPE(LEDIR_CONFIG) :: LEDIR_CONF - -TYPE LEINV_CONFIG - INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_K(:), OFFSETS_N(:) -END TYPE -TYPE(LEINV_CONFIG) :: LEINV_CONF - END MODULE TPM_TRANS diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index cf46ca56c..f91d44c43 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -153,9 +153,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP USE TPM_DIM, ONLY : R, R_NDGNH, R_NDGL USE TPM_GEOMETRY, ONLY : G, G_NDGLU USE TPM_FIELDS, ONLY : F - USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1 + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS, D_NPNTGTB1,D_OFFSETS_GEMM1 USE LEDIR_MOD, ONLY : LEDIR_STRIDES - USE TPM_TRANS, ONLY: LEDIR_CONF USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE @@ -210,7 +209,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC& PRESENT(D,D_MYMS,R,G,G_NDGLU) & !$ACC& PRESENT(D_NPNTGTB1) - !$ACC DATA PRESENT(FOUBUF,LEDIR_CONF) + !$ACC DATA PRESENT(FOUBUF,D_OFFSETS_GEMM1) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) ASYNC(1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH @@ -230,8 +229,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP PAIS = PAIS*F%RACTHE(JGL) ENDIF IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIS*F%RW(JGL) + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIS*F%RW(JGL) ELSEIF (MOD(JF-1,2) == 0) THEN ! every other field is sufficient because Im(KM=0) == 0 ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) @@ -281,9 +280,8 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL USE TPM_GEOMETRY ,ONLY : G, G_NDGLU, G_NLOEN USE TPM_FIELDS ,ONLY : F - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYSETW,D_NSTAGTF,D_NPTRLS + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYSETW,D_NSTAGTF,D_NPTRLS,D_OFFSETS_GEMM1 USE LEDIR_MOD, ONLY: LEDIR_STRIDES - USE TPM_TRANS, ONLY: LEDIR_CONF USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE @@ -334,7 +332,7 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & !$ACC& PRESENT(F,F%RW) & !$ACC& PRESENT(D,D_MYMS,G,G_NDGLU) & - !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN,LEDIR_CONF) + !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN,D_OFFSETS_GEMM1) OFFSET_VAR=D_NPTRLS(MYSETW) @@ -367,8 +365,8 @@ SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINP PAIS = PAIS*F%RACTHE(JGL) ENDIF IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*LEDIR_CONF%OFFSETS_K(KMLOC))=PAIS*F%RW(JGL) + ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIA*F%RW(JGL) + ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIS*F%RW(JGL) ELSEIF (MOD(JF-1,2) == 0) THEN ! every other field is sufficient because Im(KM=0) == 0 ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index a71b8ab16..9b3595a51 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -89,9 +89,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK USE TPM_DIM, ONLY : R, R_NDGNH,R_NDGL USE TPM_GEOMETRY,ONLY : G,G_NDGLU - USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1 + USE TPM_DISTR, ONLY : D,D_NUMP,D_MYMS,D_NPNTGTB1,D_OFFSETS_GEMM1 USE LEINV_MOD, ONLY: LEINV_STRIDES - USE TPM_TRANS, ONLY: LEINV_CONF IMPLICIT NONE @@ -122,7 +121,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,G,G_NDGLU) & - !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,LEINV_CONF) + !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) ASYNC(1) DO KMLOC=1,D_NUMP @@ -137,8 +136,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG IF(KM /= 0) THEN - ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+LEINV_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) - ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+LEINV_CONF%OFFSETS_N(KMLOC)*IOUT_STRIDES0) + ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) + ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) From cbbb666c3a4bf6b050c3d3a4c22bb964990bca53 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 29 Jul 2022 05:18:11 -0700 Subject: [PATCH 253/263] Remove direct transform --- src/trans/gpu/internal/dir_trans_ctl_mod.F90 | 26 +--- src/trans/gpu/internal/trltom_pack_unpack.F90 | 139 ------------------ 2 files changed, 7 insertions(+), 158 deletions(-) diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 3611f28aa..0355fe889 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -135,7 +135,6 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK TYPE(TRLTOM_HANDLE) :: HTRLTOM TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK - TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT TYPE(LTDIR_HANDLE) :: HLTDIR IF(NPROMATR > 0) THEN @@ -147,13 +146,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) HFTDIR = PREPARE_FTDIR() - IF (NPROC > 1) THEN - HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) - HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) - HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) - ELSE - HTRLTOM_DIRECT = PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) - ENDIF + HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) + HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS) + HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) HLTDIR = PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) @@ -173,17 +168,10 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL GSTATS(153,0) - IF (NPROC > 1) THEN - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' - CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) - CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) - CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - ELSE - WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_DIRECT' - ! Short cut - no need to go through tansforms, we will go directly into - ! the legendre space - CALL TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - ENDIF + WRITE(NOUT,*) 'ltdir_ctl:TRLTOM_CUDAAWARE' + CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) + CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) CALL GSTATS(153,1) CALL LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS, & diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index f91d44c43..5c2a8fc01 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -16,7 +16,6 @@ MODULE TRLTOM_PACK_UNPACK PRIVATE PUBLIC :: TRLTOM_PACK_HANDLE, PREPARE_TRLTOM_PACK, TRLTOM_PACK PUBLIC :: TRLTOM_UNPACK_HANDLE, PREPARE_TRLTOM_UNPACK, TRLTOM_UNPACK - PUBLIC :: TRLTOM_DIRECT_HANDLE, PREPARE_TRLTOM_DIRECT, TRLTOM_DIRECT TYPE TRLTOM_PACK_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN @@ -24,9 +23,6 @@ MODULE TRLTOM_PACK_UNPACK TYPE TRLTOM_UNPACK_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA END TYPE - TYPE TRLTOM_DIRECT_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA - END TYPE CONTAINS FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT @@ -244,140 +240,5 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC END DATA END SUBROUTINE - - FUNCTION PREPARE_TRLTOM_DIRECT(ALLOCATOR, KF_FS) RESULT(HTRLTOM_DIRECT) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD - USE TPM_DISTR, ONLY : D - USE LEDIR_MOD, ONLY : LEDIR_STRIDES - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(TRLTOM_DIRECT_HANDLE) :: HTRLTOM_DIRECT - - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=C_SIZE_T) :: ISIZE - - REAL(KIND=JPRBT) :: ZPRBT_DUMMY - REAL(KIND=JPRD) :: ZPRD_DUMMY - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE, & - & IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) - - ! Check if the reuse buffer is large enough - ISIZE = ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN_SIZE*SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) - ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) - - HTRLTOM_DIRECT%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) - END FUNCTION - - SUBROUTINE TRLTOM_DIRECT(ALLOCATOR,HTRLTOM_DIRECT,PREEL_COMPLEX,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD - USE TPM_DIM ,ONLY : R, R_NDGNH, R_NDGL - USE TPM_GEOMETRY ,ONLY : G, G_NDGLU, G_NLOEN - USE TPM_FIELDS ,ONLY : F - USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYSETW,D_NSTAGTF,D_NPTRLS,D_OFFSETS_GEMM1 - USE LEDIR_MOD, ONLY: LEDIR_STRIDES - USE, INTRINSIC :: ISO_C_BINDING - - IMPLICIT NONE - - REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:) - REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:) - REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:) - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOM_DIRECT_HANDLE), INTENT(IN) :: HTRLTOM_DIRECT - - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - - INTEGER(KIND=8) :: JF, OFFSET_VAR - INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, IGLG, KMLOC - - REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2, SCAL - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE, & - & IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) - - IALLOC_POS=1 - - IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPS(0)),128) - CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN_SIZE*SIZEOF(ZINPA(0)),128) - CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPS0(0)),128) - CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - IALLOC_SZ = ALIGN(IIN0_SIZE*SIZEOF(ZINPA0(0)),128) - CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_DIRECT%HINPS_AND_ZINPA),& - & IALLOC_POS, IALLOC_SZ) - IALLOC_POS=IALLOC_POS+IALLOC_SZ - - !$ACC DATA & - !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & - !$ACC& PRESENT(F,F%RW) & - !$ACC& PRESENT(D,D_MYMS,G,G_NDGLU) & - !$ACC& PRESENT(PREEL_COMPLEX,D_NSTAGTF,G_NLOEN,D_OFFSETS_GEMM1) - - OFFSET_VAR=D_NPTRLS(MYSETW) - - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(KM,ISL,OFFSET1,OFFSET2,PAIA,PAIS,IGLS,V1,V2,SCAL,IGLG) ASYNC(1) TILE(32,16,1) - DO JGL=1,R_NDGNH - DO KMLOC=1,D_NUMP - DO JF=1,KF_FS*2 - KM = D_MYMS(KMLOC) - ISL = R_NDGNH-G_NDGLU(KM)+1 - IF (JGL >= ISL) THEN - !(DO JGL=ISL,R_NDGNH) - IGLS = JGL - - OFFSET1 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V1 = SCAL * PREEL_COMPLEX(OFFSET1+2*(KMLOC-1)+1+MOD(JF-1,2)) - - IGLS = R_NDGL+1-JGL - OFFSET2 = KF_FS*D_NSTAGTF(IGLS)+(JF-1)/2*(D_NSTAGTF(IGLS+1)-D_NSTAGTF(IGLS)) - IGLG = OFFSET_VAR+IGLS-1 - SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - V2 = SCAL * PREEL_COMPLEX(OFFSET2+2*(KMLOC-1)+1+MOD(JF-1,2)) - - PAIA = V1-V2 - PAIS = V1+V2 - IF (JF <= 4*KF_UV) THEN - ! Multiply in case of velocity - PAIA = PAIA*F%RACTHE(JGL) - PAIS = PAIS*F%RACTHE(JGL) - ENDIF - IF (KM /= 0) THEN - ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIA*F%RW(JGL) - ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIS*F%RW(JGL) - ELSEIF (MOD(JF-1,2) == 0) THEN - ! every other field is sufficient because Im(KM=0) == 0 - ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*F%RW(JGL) - ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*F%RW(JGL) - ENDIF - ENDIF - ENDDO - ENDDO - END DO - - !$ACC END DATA - END SUBROUTINE END MODULE TRLTOM_PACK_UNPACK From fcf48cad2b5e803378098147de84642cb87fc141 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 9 Aug 2022 01:46:36 -0700 Subject: [PATCH 254/263] Add functionality to allocator to set all data to NaN - Verified that V100 passes - Small fixes in returned arrays (1 element too large sometimes) --- src/trans/gpu/internal/allocator_mod.F90 | 87 +++++++++++++++++++----- src/trans/gpu/internal/ltdir_mod.F90 | 8 +-- 2 files changed, 73 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index 48ceffa68..dad805205 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -25,8 +25,9 @@ MODULE ALLOCATOR_MOD ! the double buffer. + INTEGER(KIND=JPIM), PARAMETER :: NBUF = 2 TYPE BUFFERED_ALLOCATOR - INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:1) + INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1) INTEGER(KIND=JPIM) :: NEXT_BUF INTEGER(C_INT8_T), POINTER :: PTR(:) END TYPE @@ -36,19 +37,23 @@ MODULE ALLOCATOR_MOD END TYPE INTERFACE ASSIGN_PTR - SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING IMPLICIT NONE - INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM END SUBROUTINE - SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING IMPLICIT NONE - INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM END SUBROUTINE END INTERFACE @@ -78,16 +83,18 @@ FUNCTION RESERVE(ALLOCATOR, SZ) RESERVE%BUF = ALLOCATOR%NEXT_BUF RESERVE%SZ = SZ - ALLOCATOR%NEXT_BUF = 1-ALLOCATOR%NEXT_BUF + ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF) END FUNCTION SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, OLD_PTR) IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(C_INT8_T), OPTIONAL, INTENT(INOUT), POINTER :: OLD_PTR(:) + INTEGER :: I - ALLOCATOR%BUFR_SZ(0) = ALIGN(ALLOCATOR%BUFR_SZ(0),128) - ALLOCATOR%BUFR_SZ(1) = ALIGN(ALLOCATOR%BUFR_SZ(1),128) + DO I = 0, NBUF-1 + ALLOCATOR%BUFR_SZ(I) = ALIGN(ALLOCATOR%BUFR_SZ(I),128) + ENDDO IF (ASSOCIATED(OLD_PTR)) THEN IF (SIZEOF(OLD_PTR) < SUM(ALLOCATOR%BUFR_SZ) ) THEN @@ -122,35 +129,79 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) IF (RESERVATION%BUF == 0) THEN GET_ALLOCATION(1:) => ALLOCATOR%PTR(1:RESERVATION%SZ) ELSE - GET_ALLOCATION(1:) => ALLOCATOR%PTR(ALLOCATOR%BUFR_SZ(0)+1: & - ALLOCATOR%BUFR_SZ(0)+RESERVATION%SZ) + GET_ALLOCATION(1:) => ALLOCATOR%PTR(SUM(ALLOCATOR%BUFR_SZ(0:RESERVATION%BUF-1))+1: & + SUM(ALLOCATOR%BUFR_SZ(0:RESERVATION%BUF-1))+RESERVATION%SZ) ENDIF END FUNCTION - SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING + USE OPENACC, ONLY: ACC_ASYNC_SYNC IMPLICIT NONE - INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) REAL(KIND=C_FLOAT), POINTER, INTENT(OUT) :: DST(:) + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM + LOGICAL :: SET_VALUE_EFF + INTEGER(KIND=4) :: SET_STREAM_EFF INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN PRINT *, "Logical Error in ASSIGN_PTR - OOB assignment" STOP 4 ENDIF - CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES)), DST, & - & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES))/SIZEOF(DST(0))]) + IF (PRESENT(SET_VALUE)) THEN + SET_VALUE_EFF = SET_VALUE + ELSE + SET_VALUE_EFF = .FALSE. + ENDIF + IF (PRESENT(SET_STREAM)) THEN + SET_STREAM_EFF = SET_STREAM + ELSE + SET_STREAM_EFF = ACC_ASYNC_SYNC + ENDIF + IF (SET_VALUE_EFF) THEN + ! This option is turned off by default, but for experimentation we can turn it on. This is + ! setting all bits to 1 (meaning NaN in floating point) + !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) + SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 + !$ACC END KERNELS LOOP + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/SIZEOF(DST(0))]) END SUBROUTINE - SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES) + SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING + USE OPENACC, ONLY: ACC_ASYNC_SYNC IMPLICIT NONE - INTEGER(KIND=C_INT8_T), POINTER, INTENT(IN) :: SRC(:) + INTEGER(KIND=C_INT8_T), POINTER, INTENT(INOUT) :: SRC(:) REAL(KIND=C_DOUBLE), POINTER, INTENT(OUT) :: DST(:) + LOGICAL, INTENT(IN), OPTIONAL :: SET_VALUE + INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM + LOGICAL :: SET_VALUE_EFF + INTEGER(KIND=4) :: SET_STREAM_EFF INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN PRINT *, "Logical Error in ASSIGN_PTR - OOB assignment" STOP 4 ENDIF - CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES)), DST, & - & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES))/SIZEOF(DST(0))]) + IF (PRESENT(SET_VALUE)) THEN + SET_VALUE_EFF = SET_VALUE + ELSE + SET_VALUE_EFF = .FALSE. + ENDIF + IF (PRESENT(SET_STREAM)) THEN + SET_STREAM_EFF = SET_STREAM + ELSE + SET_STREAM_EFF = ACC_ASYNC_SYNC + ENDIF + IF (SET_VALUE_EFF) THEN + ! This option is turned off by default, but for experimentation we can turn it on. This is + ! setting all bits to 1 (meaning NaN in floating point) + !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) + SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 + !$ACC END KERNELS LOOP + ENDIF + CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & + & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/SIZEOF(DST(0))]) END SUBROUTINE END MODULE diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 1990a154e..6058a41e5 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -191,26 +191,26 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT IALLOC_SZ = ALIGN(IOUT_SIZE*SIZEOF(ZOUT(1)),128) CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT0 IALLOC_SZ = ALIGN(IOUT0_SIZE*SIZEOF(ZOUT0(1)),128) CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& - & IALLOC_POS, IALLOC_SZ) + & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! do the legendre transform From f42b15bc0aff8e86d1a4bca323712e114a98986c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 10 Aug 2022 08:24:10 -0700 Subject: [PATCH 255/263] Add some trickery for full app --- src/trans/gpu/internal/ext_acc.F90 | 382 ++++++++++++++++++++++++++ src/trans/gpu/internal/trgtol_mod.F90 | 37 ++- src/trans/gpu/internal/trltog_mod.F90 | 37 ++- 3 files changed, 446 insertions(+), 10 deletions(-) create mode 100644 src/trans/gpu/internal/ext_acc.F90 diff --git a/src/trans/gpu/internal/ext_acc.F90 b/src/trans/gpu/internal/ext_acc.F90 new file mode 100644 index 000000000..283cd27b7 --- /dev/null +++ b/src/trans/gpu/internal/ext_acc.F90 @@ -0,0 +1,382 @@ +module openacc_ext_type + use iso_c_binding + implicit none + private + public :: ext_acc_arr_desc + + ! to my knowledge, this cannot be part of openacc_ext + type ext_acc_arr_desc + integer(c_size_t) :: ptr, sz + end type +end module +module openacc_ext + use iso_c_binding + use openacc + use openacc_ext_type + implicit none + + private + public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & + & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind + + type common_pointer_descr + type(c_ptr) :: ptr + integer(c_size_t) :: sz + end type + + interface ext_acc_pass + function ext_acc_pass_2d_r4(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:) + end function + function ext_acc_pass_3d_r4(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:) + end function + function ext_acc_pass_4d_r4(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:,:) + end function + function ext_acc_pass_2d_r8(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:) + end function + function ext_acc_pass_3d_r8(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:) + end function + function ext_acc_pass_4d_r8(arr) result(ret) + use openacc_ext_type + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:,:) + end function + end interface +contains + + function ext_acc_pass_2d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) + end function + function ext_acc_pass_3d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) + end function + function ext_acc_pass_4d_r4(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(4), intent(in) :: arr(:,:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) + end function + function ext_acc_pass_2d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) + ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) + end function + function ext_acc_pass_3d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) + end function + function ext_acc_pass_4d_r8(arr) result(ret) + implicit none + type(ext_acc_arr_desc) :: ret + real(8), intent(in) :: arr(:,:,:,:) + + type(c_ptr) :: ptr1, ptr2 + integer(c_size_t) :: ptr1_v, ptr2_v + + ! get full slices for all but the last slice + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + + ret%ptr = ptr1_v + ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) + + ! for the last slice, take the actual offset, otherwise we imght go OOB + ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) + ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) + ptr1_v= transfer(ptr1, ptr1_v) + ptr2_v= transfer(ptr2, ptr2_v) + ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) + end function + function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) + implicit none + type(ext_acc_arr_desc), intent(in) :: in_ptrs(:) + type(common_pointer_descr), intent(out) :: out_ptrs(:) + + integer(c_size_t), allocatable :: ptrs_only(:) + logical, allocatable :: mask(:) + integer, allocatable :: sort_index(:) + + type(ext_acc_arr_desc), allocatable :: common_ptrs(:) + integer :: i, j, num_ranges + integer(c_size_t) :: start1, start2, end1, end2 + logical :: found + + ! first sort the pointers increasingly such that no gaps are possible + allocate(ptrs_only(size(in_ptrs))) + do i = 1, size(in_ptrs) + ptrs_only(i) = in_ptrs(i)%ptr + enddo + allocate(mask(size(in_ptrs))) + do i = 1, size(in_ptrs) + mask(i) = .true. + enddo + allocate(sort_index(size(in_ptrs))) + do i = 1, size(in_ptrs) + j = minloc(ptrs_only, 1, mask=mask) + mask(j) = .false. + sort_index(i) = j + enddo + + ! initialize + allocate(common_ptrs(size(in_ptrs))) + do i = 1, size(in_ptrs) + common_ptrs(1)%ptr = 0 + common_ptrs(1)%sz = 0 + enddo + + num_ranges = 1 + common_ptrs(1) = in_ptrs(sort_index(1)) + do i = 2, size(in_ptrs) + found = .false. + start1 = in_ptrs(sort_index(i))%ptr + end1 = in_ptrs(sort_index(i))%ptr + in_ptrs(sort_index(i))%sz + do j = 1, num_ranges + start2 = common_ptrs(j)%ptr + end2 = common_ptrs(j)%ptr + common_ptrs(j)%sz + if (max(start1, start2) <= min(end1, end2)) then + ! if we intersect with this range, extend the range + common_ptrs(j)%ptr = min(start1, start2) + common_ptrs(j)%sz = max(end1, end2) - common_ptrs(j)%ptr + found = .true. + exit + endif + enddo + if (.not. found) then + ! if we did not find anything: add a new one + num_ranges = num_ranges + 1 + common_ptrs(num_ranges)%ptr = start1 + common_ptrs(num_ranges)%sz = end1 - start1 + endif + enddo + do i = 1, num_ranges + out_ptrs(i)%ptr = transfer(common_ptrs(i)%ptr, out_ptrs(i)%ptr) + out_ptrs(i)%sz = common_ptrs(i)%sz + enddo + end function + subroutine ext_acc_create(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + print *, "creating ", common_ptrs(i)%sz + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + call acc_create_async(pp, common_ptrs(i)%sz, async=stream_act) + enddo + print *, "done" + end subroutine + subroutine ext_acc_copyin(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + call acc_copyin_async(pp, common_ptrs(i)%sz, async=stream_act) + enddo + end subroutine + subroutine ext_acc_copyout(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + call acc_copyout_async(pp, common_ptrs(i)%sz, async=stream_act) + enddo + end subroutine + subroutine ext_acc_delete(ptrs, stream) + use openacc + implicit none + type(ext_acc_arr_desc), intent(in) :: ptrs(:) + integer(acc_handle_kind), optional :: stream + + type(common_pointer_descr), allocatable :: common_ptrs(:) + + integer :: i, num_ranges + integer(4), pointer :: pp(:) + + integer(acc_handle_kind) :: stream_act + + if (present(stream)) then + stream_act = stream + else + stream_act = acc_async_sync + endif + allocate(common_ptrs(size(ptrs))) + num_ranges = get_common_pointers(ptrs, common_ptrs) + + do i = 1, num_ranges + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) + call acc_delete_async(pp, common_ptrs(i)%sz, async=stream_act) + enddo + end subroutine +end module diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 51dd4f3b3..733f20af7 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -116,6 +116,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : NPROMA USE ALLOCATOR_MOD + USE OPENACC_EXT IMPLICIT NONE @@ -167,6 +168,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) + TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... + INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 + #ifdef PARKINDTRANS_SINGLE #define TRGTOL_DTYPE MPI_REAL #else @@ -319,11 +323,33 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(430,1) ENDIF CALL GSTATS(412,0) - !$ACC DATA IF(PRESENT(PGP)) COPYIN(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYIN(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYIN(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYIN(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYIN(PGP3B) ASYNC(1) + ACC_POINTERS_CNT = 0 + IF (PRESENT(PGP)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) + ENDIF + IF (PRESENT(PGPUV)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) + ENDIF + IF (PRESENT(PGP2)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) + ENDIF + IF (PRESENT(PGP3A)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) + ENDIF + IF (PRESENT(PGP3B)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYIN(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) IF (LSYNC_TRANS) THEN !$ACC WAIT(1) CALL GSTATS(432,0) @@ -579,6 +605,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA !PGP2 !$ACC END DATA !PGPUV !$ACC END DATA !PGP + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 90ea26e40..96f78e563 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -115,6 +115,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA + USE OPENACC_EXT IMPLICIT NONE @@ -173,6 +174,9 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR + TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... + INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 + #ifdef PARKINDTRANS_SINGLE #define TRLTOG_DTYPE MPI_REAL #else @@ -465,11 +469,33 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP)) COPYOUT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) COPYOUT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) COPYOUT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) COPYOUT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) COPYOUT(PGP3B) ASYNC(1) + ACC_POINTERS_CNT = 0 + IF (PRESENT(PGP)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) + ENDIF + IF (PRESENT(PGPUV)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) + ENDIF + IF (PRESENT(PGP2)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) + ENDIF + IF (PRESENT(PGP3A)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) + ENDIF + IF (PRESENT(PGP3B)) THEN + ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 + ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYIN(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) + !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) + !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) ! Present until self contribution and packing are done !$ACC DATA PRESENT(PREEL_REAL) @@ -719,6 +745,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC END DATA ! PGP2 !$ACC END DATA ! PGPUV !$ACC END DATA ! PGP + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYOUT(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) IF (LSYNC_TRANS) THEN !$ACC WAIT(1) CALL GSTATS(442,0) From 7b7b7b35c0dfdd3d6ea096fe9cdfa5ed1d660d28 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 11 Aug 2022 00:26:50 -0700 Subject: [PATCH 256/263] Fix to support different resolutions --- src/trans/gpu/external/setup_trans.F90 | 6 +++++- src/trans/gpu/internal/ext_acc.F90 | 2 -- src/trans/gpu/internal/sump_trans_mod.F90 | 17 +++++++---------- src/trans/gpu/internal/tpm_distr.F90 | 4 +++- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 34eb818d9..c3d971c72 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -107,7 +107,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& & NMAX_RESOL, NPRINTLEV, LENABLED, NERR USE TPM_DIM ,ONLY : R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL USE TPM_DISTR ,ONLY : D, DISTR_RESOL,NPROC,nprtrv, D_NUMP,D_MYMS,D_NSTAGT0B,D_NSTAGT1B,D_NPROCL,D_NPNTGTB1, D_NASM0, & -& D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,mysetv,mysetw, MYPROC +& D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,mysetv,mysetw, MYPROC,D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX USE TPM_FIELDS ,ONLY : FIELDS_RESOL, F,F_RW, ZEPSNM, & & ZAA,ZAS,& @@ -587,6 +587,10 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& end DO END DO +D_OFFSETS_GEMM1 => D%OFFSETS_GEMM1 +D_OFFSETS_GEMM2 => D%OFFSETS_GEMM2 +!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) + D_NUMP=D%NUMP KMLOC0 = -1 diff --git a/src/trans/gpu/internal/ext_acc.F90 b/src/trans/gpu/internal/ext_acc.F90 index 283cd27b7..777f8cd9b 100644 --- a/src/trans/gpu/internal/ext_acc.F90 +++ b/src/trans/gpu/internal/ext_acc.F90 @@ -295,11 +295,9 @@ subroutine ext_acc_create(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - print *, "creating ", common_ptrs(i)%sz call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) call acc_create_async(pp, common_ptrs(i)%sz, async=stream_act) enddo - print *, "done" end subroutine subroutine ext_acc_copyin(ptrs, stream) use openacc diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 60d5821e3..0f43f02ff 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -23,7 +23,7 @@ SUBROUTINE SUMP_TRANS USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R USE TPM_GEOMETRY ,ONLY : G -USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 +USE TPM_DISTR ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC !USE SUWAVEDI_MOD !USE PE2SET_MOD @@ -269,15 +269,15 @@ SUBROUTINE SUMP_TRANS IF(ALLOCATED(ZDUM)) DEALLOCATE(ZDUM) DEALLOCATE(IGPTOTL) -ALLOCATE(D_OFFSETS_GEMM1(D%NUMP+1)) -ALLOCATE(D_OFFSETS_GEMM2(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) OFFSET1 = 0 OFFSET2 = 0 DO KMLOC=1,D%NUMP KM = D%MYMS(KMLOC) - D_OFFSETS_GEMM1(KMLOC) = OFFSET1 - D_OFFSETS_GEMM2(KMLOC) = OFFSET2 + D%OFFSETS_GEMM1(KMLOC) = OFFSET1 + D%OFFSETS_GEMM2(KMLOC) = OFFSET2 !KM=0 is transformed in double precision, no need to store here IF (KM /= 0) THEN @@ -286,11 +286,8 @@ SUBROUTINE SUMP_TRANS OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) ENDIF ENDDO -D_OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 -D_OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 - -!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) - +D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 +D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 ! ------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 index 6a0bf0fdc..9cf2f809d 100755 --- a/src/trans/gpu/internal/tpm_distr.F90 +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -160,6 +160,8 @@ MODULE TPM_DISTR REAL(KIND=JPRBT) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set +INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) + END TYPE DISTR_TYPE !flat versions of the above @@ -182,7 +184,7 @@ MODULE TPM_DISTR ! The offsets in the input and output arrays to the gemms. ! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) ! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) -INTEGER(KIND=JPIM), ALLOCATABLE :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) +INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) TYPE(DISTR_TYPE),POINTER :: D From 2ac1b3b6c8bc95c29a76a20f78f1cc2602e0acf0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 11 Aug 2022 00:27:35 -0700 Subject: [PATCH 257/263] Typo in driver --- src/programs/driver-spectraltransform_indiv.F90 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/programs/driver-spectraltransform_indiv.F90 b/src/programs/driver-spectraltransform_indiv.F90 index 920d92fa5..55992dbbd 100644 --- a/src/programs/driver-spectraltransform_indiv.F90 +++ b/src/programs/driver-spectraltransform_indiv.F90 @@ -951,7 +951,7 @@ PROGRAM TRANSFORM_TEST ZGMV = 0 CALL INV_TRANS(PSPSC2=ZT(:,:,1),& & PGP2=ZGMV(:,:,5,:), & - & KRESOL=1,KPROMA=NGPTOT,KVSETSC2=IVSET) + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSET) CALL DUMP_GRIDPOINT_FIELD_4D(JJSTEP, MYPROC, ZGMV(:,:,:,:), 'M', NOUTDUMP) ZTSTEP1(JSTEP)=(TIMEF()-ZTSTEP1(JSTEP))/1000.0_JPRD @@ -1013,7 +1013,7 @@ PROGRAM TRANSFORM_TEST ZDIV = 0 CALL DIR_TRANS(PSPSC2=ZT(:,:,1),& & PGP2=ZGMV(:,:,5,:), & - & KRESOL=1,KPROMA=NGPTOT,KVSETSC2=IVSET) + & KRESOL=1,KPROMA=NPROMA,KVSETSC2=IVSET) CALL DUMP_GRIDPOINT_FIELD_3D(JJSTEP, MYPROC, ZT(:,:,:), 'T', NOUTDUMP) JJSTEP = 14 @@ -1044,7 +1044,8 @@ PROGRAM TRANSFORM_TEST ZT = 0 ZVOR = 0 ZDIV = 0 - CALL DIR_TRANS(PSPSCALAR=ZSP(1:1,:),PGP=ZGMVS(:,1:1,:),KVSETSC=IVSETSC(1:1)) + CALL DIR_TRANS(PSPSCALAR=ZSP(1:1,:),PGP=ZGMVS(:,1:1,:),KVSETSC=IVSETSC(1:1),& + & KPROMA=NPROMA) CALL DUMP_GRIDPOINT_FIELD_2D(JJSTEP, MYPROC, ZSP(:,:), 'V', NOUTDUMP) JJSTEP = 15 From c771877be8229f2db25c9f153b1d5364c79e174a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 11 Aug 2022 06:11:32 -0700 Subject: [PATCH 258/263] Cleanup setup_trans / do no re-allocate arrays --- src/trans/gpu/external/setup_trans.F90 | 129 +++++------------------- src/trans/gpu/internal/tpm_distr.F90 | 22 ++-- src/trans/gpu/internal/tpm_fields.F90 | 2 +- src/trans/gpu/internal/tpm_geometry.F90 | 7 +- 4 files changed, 40 insertions(+), 120 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index c3d971c72..a4714b476 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -486,121 +486,43 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& CALL PREPSNM !$ACC ENTER DATA COPYIN(ZEPSNM) -! TODO: I guess tose might be needed again -! add arrays for GPNORM1 -!ALLOCATE(ZAVE(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGPN(IF_FS)) -!ALLOCATE(ZMAXGPN(IF_FS)) - -!ZAVE = 0._JPRBT -!ZMINGL = 0._JPRBT -!ZMAXGL = 0._JPRBT -!ZMINGPN = 0._JPRBT -!ZMAXGPN = 0._JPRBT -!!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) - !set up flat copies of constant data R_NSMAX=R%NSMAX R_NTMAX=R%NTMAX R_NDGNH=R%NDGNH R_NDGL=R%NDGL +G_NDGLU => G%NDGLU + +G_NMEN => G%NMEN +G_NMEN_MAX=MAXVAL(G_NMEN) + +G_NLOEN => G%NLOEN +G_NLOEN_MAX=MAXVAL(G_NLOEN) + +D_NSTAGT0B => D%NSTAGT0B +D_NSTAGT1B => D%NSTAGT1B + +D_NPROCL => D%NPROCL +D_NASM0 => D%NASM0 +D_NSTAGTF => D%NSTAGTF +D_MSTABF => D%MSTABF +D_NPROCM => D%NPROCM +D_NPTRLS => D%NPTRLS -ALLOCATE(D_NSTAGT0B(SIZE(D%NSTAGT0B))) -ALLOCATE(D_NSTAGT1B(SIZE(D%NSTAGT1B))) -ALLOCATE(D_NPNTGTB0(0:SIZE(D%NPNTGTB0,1)-1,SIZE(D%NPNTGTB0,2))) -ALLOCATE(D_NPNTGTB1(SIZE(D%NPNTGTB1,1),SIZE(D%NPNTGTB1,2))) -ALLOCATE(D_MYMS(SIZE(D%MYMS))) -ALLOCATE(D_NPROCL(SIZE(D%NPROCL))) -ALLOCATE(D_NASM0(0:SIZE(D%NASM0)-1)) -ALLOCATE(D_NSTAGTF(SIZE(D%NSTAGTF))) -ALLOCATE(D_MSTABF(SIZE(D%MSTABF))) -ALLOCATE(D_NPROCM(0:SIZE(D%NPROCM)-1)) -ALLOCATE(D_NPTRLS(SIZE(D%NPTRLS))) - -ALLOCATE(G_NDGLU(0:SIZE(G%NDGLU)-1)) -ALLOCATE(G_NMEN(SIZE(G%NMEN))) -ALLOCATE(G_NLOEN(SIZE(G%NLOEN))) - -ALLOCATE(F_RW(SIZE(F%RW))) - - -DO I=0,SIZE(G%NDGLU)-1 - G_NDGLU(I)=G%NDGLU(I) -end DO - -G_NMEN_MAX=0 -DO I=1,SIZE(G%NMEN) - G_NMEN(I)=G%NMEN(I) - if (G_NMEN(I) .gt. G_NMEN_MAX) G_NMEN_MAX=G_NMEN(I) -end DO - -G_NLOEN_MAX=0 -DO I=1,SIZE(G%NLOEN) - G_NLOEN(I)=G%NLOEN(I) - if (G_NLOEN(I) .gt. G_NLOEN_MAX) G_NLOEN_MAX=G_NLOEN(I) -end DO - -DO I=1,SIZE(D%NSTAGT0B) - D_NSTAGT0B(I)=D%NSTAGT0B(I) -END DO - -DO I=1,SIZE(D%NSTAGT1B) - D_NSTAGT1B(I)=D%NSTAGT1B(I) -END DO - -DO I=1,SIZE(D%NPROCL) - D_NPROCL(I)=D%NPROCL(I) -END DO - -DO I=0,SIZE(D%NASM0)-1 - D_NASM0(I)=D%NASM0(I) -END DO - -DO I=1,SIZE(D%NSTAGTF) - D_NSTAGTF(I)=D%NSTAGTF(I) -END DO - -DO I=1,SIZE(D%MSTABF) - D_MSTABF(I)=D%MSTABF(I) -END DO - -DO I=0,SIZE(D%NPROCM)-1 - D_NPROCM(I)=D%NPROCM(I) -END DO - -DO I=1,SIZE(D%NPTRLS) - D_NPTRLS(I)=D%NPTRLS(I) -END DO - -DO I=1,SIZE(D%NPNTGTB0,2) - DO J=0,SIZE(D%NPNTGTB0,1)-1 - D_NPNTGTB0(J,I)=D%NPNTGTB0(J,I) - end DO -END DO - -DO I=1,SIZE(D%NPNTGTB1,2) - DO J=1,SIZE(D%NPNTGTB1,1) - D_NPNTGTB1(J,I)=D%NPNTGTB1(J,I) - end DO -END DO +D_NPNTGTB0 => D%NPNTGTB0 +D_NPNTGTB1 => D%NPNTGTB1 D_OFFSETS_GEMM1 => D%OFFSETS_GEMM1 D_OFFSETS_GEMM2 => D%OFFSETS_GEMM2 -!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) D_NUMP=D%NUMP -KMLOC0 = -1 -DO I=1,SIZE(D%MYMS) - D_MYMS(I)=D%MYMS(I) - IF(D_MYMS(I) == 0) KMLOC0 = I -end DO +KMLOC0 = FINDLOC(D%MYMS, VALUE=0, DIM=1) +D_MYMS => D%MYMS ! arrays for m=0 in ledir_mod: -IF(KMLOC0 >= 0) THEN +IF(KMLOC0 > 0) THEN ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) ZAA0 = ZAA(:,:,KMLOC0) @@ -609,17 +531,14 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' ENDIF -DO I=1,SIZE(F%RW) - F_RW(I)=F%RW(I) -END DO +F_RW => F%RW !$ACC ENTER DATA COPYIN(D_NSTAGT0B,D_NSTAGT1B,& !$ACC& D_NPNTGTB1,D_NPROCL,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& !$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,& -!$ACC& F_RW) +!$ACC& F_RW,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) WRITE(NOUT,*) '===GPU arrays successfully allocated' -!$ACC wait ! free memory !DO JMLOC=1,D%NUMP diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 index 9cf2f809d..e72b70560 100755 --- a/src/trans/gpu/internal/tpm_distr.F90 +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -166,19 +166,19 @@ MODULE TPM_DISTR !flat versions of the above INTEGER(KIND=JPIM) :: D_NUMP ! No. of spectral waves handled by this processor -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:) ! Wave numbers handled by this PE -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer +INTEGER(KIND=JPIM) ,POINTER :: D_MYMS(:) ! Wave numbers handled by this PE +INTEGER(KIND=JPIM) ,POINTER :: D_NSTAGT0B(:) ! Start adresses for segments within buffer ! (according to processors to whom data ! is going to be sent) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:) ! Process that does the calc. for certain -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S) +INTEGER(KIND=JPIM) ,POINTER :: D_NSTAGT1B(:) +INTEGER(KIND=JPIM) ,POINTER :: D_NPROCL(:) ! Process responsible for each lat. (F.S) +INTEGER(KIND=JPIM) ,POINTER :: D_NPNTGTB1(:,:) +INTEGER(KIND=JPIM) ,POINTER :: D_NASM0(:) ! Address in a spectral array of (m, n=m) +INTEGER(KIND=JPIM) ,POINTER :: D_NSTAGTF(:) ! Offset for specific latitude in +INTEGER(KIND=JPIM) ,POINTER :: D_MSTABF(:) +INTEGER(KIND=JPIM) ,POINTER :: D_NPNTGTB0(:,:) +INTEGER(KIND=JPIM) ,POINTER :: D_NPROCM(:) ! Process that does the calc. for certain +INTEGER(KIND=JPIM) ,POINTER :: D_NPTRLS(:) ! Pointer to first lat. (F.S) ! The offsets in the input and output arrays to the gemms. diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index a27ad81a2..1492e6e32 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -33,7 +33,7 @@ MODULE TPM_FIELDS END TYPE FIELDS_TYPE !flat copies of the above -REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RW(:) ! Weights of the Gaussian quadrature +REAL(KIND=JPRBT) ,POINTER :: F_RW(:) ! Weights of the Gaussian quadrature TYPE(FIELDS_TYPE),ALLOCATABLE,TARGET :: FIELDS_RESOL(:) TYPE(FIELDS_TYPE),POINTER :: F diff --git a/src/trans/gpu/internal/tpm_geometry.F90 b/src/trans/gpu/internal/tpm_geometry.F90 index 21ce925bd..c2738c6a4 100755 --- a/src/trans/gpu/internal/tpm_geometry.F90 +++ b/src/trans/gpu/internal/tpm_geometry.F90 @@ -1,4 +1,5 @@ ! (C) Copyright 2000- ECMWF. +! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -31,10 +32,10 @@ MODULE TPM_GEOMETRY END TYPE GEOM_TYPE !flat copies of the above -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER +INTEGER(KIND=JPIM),POINTER :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES +INTEGER(KIND=JPIM),POINTER :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER INTEGER(KIND=JPIM) :: G_NMEN_MAX -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL +INTEGER(KIND=JPIM),POINTER :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL INTEGER(KIND=JPIM) :: G_NLOEN_MAX TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:) From d7f8b37dfa5c0bb64c665e4be8f9ecc80a6b18ec Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 30 Aug 2022 02:45:11 -0700 Subject: [PATCH 259/263] Tiny mix in allocator (not acually used in production) --- src/trans/gpu/internal/allocator_mod.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index dad805205..cf65d215b 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -159,7 +159,7 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE ELSE SET_STREAM_EFF = ACC_ASYNC_SYNC ENDIF - IF (SET_VALUE_EFF) THEN + IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) @@ -194,7 +194,7 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU ELSE SET_STREAM_EFF = ACC_ASYNC_SYNC ENDIF - IF (SET_VALUE_EFF) THEN + IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) From b2953e46143962c0e3046d82150cd1ed6c2f4587 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 31 Aug 2022 23:10:18 -0700 Subject: [PATCH 260/263] FIX: GPNORM issue when NLEV changes across calls --- src/trans/gpu/external/gpnorm_trans.F90 | 41 ++++++++----------------- src/trans/gpu/external/setup_trans.F90 | 1 - src/trans/gpu/external/trans_end.F90 | 2 +- src/trans/gpu/internal/tpm_trans.F90 | 8 ----- 4 files changed, 13 insertions(+), 39 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index f81a0f04e..18dff4e99 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -57,9 +57,9 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN ,ONLY : NOUT USE TPM_DIM ,ONLY : R -USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA, ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN +USE TPM_TRANS ,ONLY : LGPNORM, NGPBLKS, NPROMA USE TPM_DISTR ,ONLY : D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF,D_NPTRLS, MYPROC -USE TPM_GEOMETRY ,ONLY : G,G_NLOEN,G_NLOEN_MAX +USE TPM_GEOMETRY ,ONLY : G,G_NLOEN USE TPM_FIELDS ,ONLY : F USE SET_RESOL_MOD ,ONLY : SET_RESOL USE SET2PE_MOD ,ONLY : SET2PE @@ -96,11 +96,11 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !GPU REAL(KIND=JPRBT) :: V REAL(KIND=JPRBT), POINTER :: PREEL_REAL(:) -!REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) -!REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) -!REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) -!REAL(KIND=JPRBT),ALLOCATABLE :: ZMIN(:) -!REAL(KIND=JPRBT),ALLOCATABLE :: ZMAX(:) +REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) +REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) REAL(KIND=JPRD),ALLOCATABLE :: ZAVEG(:,:) REAL(KIND=JPRB),ALLOCATABLE :: ZMING(:) REAL(KIND=JPRB),ALLOCATABLE :: ZMAXG(:) @@ -151,7 +151,6 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IF_FS=IF_FS+1 ENDIF ENDDO -if (.not. allocated(zave)) then ALLOCATE(ZAVE(IF_FS,R%NDGL)) ALLOCATE(ZMINGL(IF_FS,R%NDGL)) ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) @@ -163,9 +162,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ZMAXGL = 0._JPRBT ZMINGPN = 0._JPRBT ZMAXGPN = 0._JPRBT -!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) - -endif +!$ACC DATA COPY(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) ALLOCATE(IVSETS(NPRTRV)) IVSETS(:)=0 @@ -190,7 +187,6 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,IF_GP,IF_FS) CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, REUSE_PTR) -! done in setup_trans LGPNORM=.TRUE. CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,IF_FS,IF_GP,0,IF_SCALARS_G,& & KVSETSC=IVSET,PGP=PGP) @@ -202,10 +198,9 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) CALL GSTATS(1429,0) IF( IF_FS > 0 )THEN - !$ACC data & - !$ACC& COPY(F,F%RW) & - !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & - !$ACC& present(PREEL_REAL,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) + !$ACC DATA & + !$ACC& PRESENT(F,F%RW) & + !$ACC& PRESENT(D,D_NSTAGTF,D_NPTRLS,G_NLOEN) !$ACC KERNELS DO JF=1,IF_FS @@ -252,20 +247,8 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC end data -!$ACC update host(ZAVE) -!$ACC update host(ZMINGPN) -!$ACC update host(ZMAXGPN) -!$ACC wait - -!iunit=300+myproc -!DO JGL=IBEG,IEND -! IGL = D_NPTRLS(MYSETW) + JGL - 1 -! DO JF=1,IF_FS -! write(iunit,*) 'aver final ',JF,IF_FS,IGL,ZAVE(JF,JGL),ZMINGPN(JF),ZMAXGPN(JF) -! ENDDO -!ENDDO - ENDIF +!$ACC end data CALL GSTATS(1429,1) ! from here rest on CPU diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index a4714b476..e64d8c5ae 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -118,7 +118,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif USE TPM_FLT -USE TPM_TRANS ,ONLY : ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN USE TPM_CTL USE SET_RESOL_MOD ,ONLY : SET_RESOL diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 590a850ef..cc03abc6d 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -57,7 +57,7 @@ SUBROUTINE TRANS_END(CDMODE) USE TPM_FFTW ,ONLY : TW, FFTW_RESOL #endif USE TPM_FLT -USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN, ZGTF +USE TPM_TRANS ,ONLY : FOUBUF, FOUBUF_IN USE EQ_REGIONS_MOD ,ONLY : N_REGIONS USE SET_RESOL_MOD ,ONLY : SET_RESOL diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index ac6081272..8b72deae9 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -56,14 +56,6 @@ MODULE TPM_TRANS LOGICAL :: LGPNORM = .FALSE. ! indicates whether transform is being done for gpnorm -REAL(KIND=JPRBT),ALLOCATABLE,TARGET :: ZGTF(:,:) - -REAL(KIND=JPRBT),ALLOCATABLE :: ZAVE(:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:) -REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:) - ! This is used in fourier space and in spectral space. It's reused among ! the transforms because we cannot reallocate - the captured CUDA graphs ! should not be modified. Hence, we keep it if it is large enough, otherwise From bcae6f9a445a20bda8eaaf34968436da4fc75304 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Fri, 2 Sep 2022 11:08:24 +0200 Subject: [PATCH 261/263] Remove redundant transfers --- src/trans/gpu/internal/trgtol_mod.F90 | 17 ++++++++++++++++- src/trans/gpu/internal/trltog_mod.F90 | 19 +++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 733f20af7..6ed2a3273 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -344,7 +344,22 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYIN(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP)) THEN + !$ACC UPDATE DEVICE(PGP) ASYNC(1) + ENDIF + IF (PRESENT(PGPUV)) THEN + !$ACC UPDATE DEVICE(PGPUV) ASYNC(1) + ENDIF + IF (PRESENT(PGP2)) THEN + !$ACC UPDATE DEVICE(PGP2) ASYNC(1) + ENDIF + IF (PRESENT(PGP3A)) THEN + !$ACC UPDATE DEVICE(PGP3A) ASYNC(1) + ENDIF + IF (PRESENT(PGP3B)) THEN + !$ACC UPDATE DEVICE(PGP3B) ASYNC(1) + ENDIF !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 96f78e563..67bd18b5f 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -490,7 +490,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYIN(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) @@ -745,7 +745,22 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC END DATA ! PGP2 !$ACC END DATA ! PGPUV !$ACC END DATA ! PGP - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_COPYOUT(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP)) THEN + !$ACC UPDATE HOST(PGP) ASYNC(1) + ENDIF + IF (PRESENT(PGPUV)) THEN + !$ACC UPDATE HOST(PGPUV) ASYNC(1) + ENDIF + IF (PRESENT(PGP2)) THEN + !$ACC UPDATE HOST(PGP2) ASYNC(1) + ENDIF + IF (PRESENT(PGP3A)) THEN + !$ACC UPDATE HOST(PGP3A) ASYNC(1) + ENDIF + IF (PRESENT(PGP3B)) THEN + !$ACC UPDATE HOST(PGP3B) ASYNC(1) + ENDIF + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) IF (LSYNC_TRANS) THEN !$ACC WAIT(1) CALL GSTATS(442,0) From a017a5b0f8ffa827a5cbdb71a6f8321552f79593 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Tue, 4 Oct 2022 05:46:49 -0700 Subject: [PATCH 262/263] Add missing copyrights --- AUTHORS | 11 ++++++----- CMakeLists.txt | 1 - src/trans/gpu/algor/external/fourier/fft_wrapper.cu | 8 ++++++++ src/trans/gpu/algor/external/gemm/gemm_wrapper.cu | 8 ++++++++ src/trans/gpu/internal/allocator_mod.F90 | 7 +++++++ src/trans/gpu/internal/cuda_gemm_batched_mod.F90 | 7 +++++++ src/trans/gpu/internal/ext_acc.F90 | 7 +++++++ 7 files changed, 43 insertions(+), 6 deletions(-) diff --git a/AUTHORS b/AUTHORS index c8f0a1ccc..43724eccd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,20 +1,21 @@ Authors and Contributors ======================== +- P. Courtier (ECMWF) - W. Deconinck (ECMWF) +- D. Degrauwe (RMI) - D. Dent (ECMWF) - P. Dueben (ECMWF) - R. El Khatib (Meteo France) +- D. Giard (Meteo France) - J. Hague (ECMWF) - M. Hamrud (ECMWF) +- M. Hortal (ECMWF) - L. Isaksen (ECMWF) -- G. Mozdzynski (ECMWF) - P. Marguinaud (Meteo France) +- L. Mosimann (NVIDIA) +- G. Mozdzynski (ECMWF) - A. Mueller (ECMWF) -- M. Hortal (ECMWF) -- P. Courtier (ECMWF) -- D. Degrauwe (RMI) -- D. Giard (Meteo France) - G. Radnoti (ECMWF) - D. Salmond (ECMWF) - Y. Seity (Meteo France) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e57b1b26..85f0bf425 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,4 @@ # (C) Copyright 2020- ECMWF. -# (C) Copyright 2022- NVIDIA. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. diff --git a/src/trans/gpu/algor/external/fourier/fft_wrapper.cu b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu index fcff3eefe..d4f3d7f5d 100644 --- a/src/trans/gpu/algor/external/fourier/fft_wrapper.cu +++ b/src/trans/gpu/algor/external/fourier/fft_wrapper.cu @@ -1,3 +1,11 @@ +// (C) Copyright 2022- NVIDIA. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + #include "cufft.h" #include "stdio.h" #include diff --git a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu index 9f3afc161..70ec2e47a 100644 --- a/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu +++ b/src/trans/gpu/algor/external/gemm/gemm_wrapper.cu @@ -1,3 +1,11 @@ +// (C) Copyright 2022- NVIDIA. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + #include #include diff --git a/src/trans/gpu/internal/allocator_mod.F90 b/src/trans/gpu/internal/allocator_mod.F90 index cf65d215b..521d82cd6 100644 --- a/src/trans/gpu/internal/allocator_mod.F90 +++ b/src/trans/gpu/internal/allocator_mod.F90 @@ -1,3 +1,10 @@ +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) MODULE ALLOCATOR_MOD diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index ab7ec73a7..3edf59305 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -1,3 +1,10 @@ +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. MODULE CUDA_GEMM_BATCHED_MOD USE PARKIND1, ONLY: JPRD, JPRM, JPIM USE CUBLAS, ONLY: CUBLAS_OP_N, CUBLAS_OP_T diff --git a/src/trans/gpu/internal/ext_acc.F90 b/src/trans/gpu/internal/ext_acc.F90 index 777f8cd9b..8beaa0172 100644 --- a/src/trans/gpu/internal/ext_acc.F90 +++ b/src/trans/gpu/internal/ext_acc.F90 @@ -1,3 +1,10 @@ +! (C) Copyright 2022- NVIDIA. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. module openacc_ext_type use iso_c_binding implicit none From 19b4d1378d484e86a5aadfc341ca7d94a2ba82b0 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 17 Oct 2022 23:44:14 -0700 Subject: [PATCH 263/263] Fix install of interface --- src/trans/gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 05e4beca8..bdd3c4f85 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -87,7 +87,7 @@ set_property( TARGET gpu PROPERTY CUDA_ARCHITECTURES 70 ) ## Install trans interface -file( GLOB trans_interface interface/* ) +file( GLOB trans_interface include/ectrans/* ) install( FILES ${trans_interface} DESTINATION include/ectrans