Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run the entire advection loop on device #34

Open
wants to merge 17 commits into
base: develop-1.3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,17 @@ ecbuild_add_option( FEATURE ACC
REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran"
CONDITION HAVE_LOKI AND NOT LOKI_MODE MATCHES "idem|idem-stack" )

### CUDA-aware MPI
### CUDA
# Probe for a CUDA compiler without aborting configuration when none exists;
# check_language() only populates CMAKE_CUDA_COMPILER if the probe succeeds.
include(CheckLanguage)
check_language(CUDA)
# Optional CUDA feature, off by default. It can only be switched on when a
# CUDA compiler was detected above and the ACC feature is active.
ecbuild_add_option( FEATURE CUDA
DESCRIPTION "CUDA" DEFAULT OFF
CONDITION CMAKE_CUDA_COMPILER AND HAVE_ACC )
# Enable the CUDA language only once the feature is on
# (HAVE_CUDA is presumably set by the ecbuild_add_option call above).
if( HAVE_CUDA )
enable_language( CUDA )
endif()

### GPU-aware MPI
ecbuild_add_option( FEATURE GPU_AWARE_MPI
DEFAULT OFF
DESCRIPTION "Enable GPU-aware MPI"
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,16 +232,15 @@ Building
The recommended option for building the GPU enabled ecWAM is to use the provided bundle, and pass the
`--with-loki --with-acc` options. Different Loki transformations can also be chosen at build-time via the following
bundle option: `--loki-mode=<trafo>`. Direct GPU-to-GPU MPI communications can be enabled by passing the
`--with-gpu-aware-mpi` option.
`--with-gpu-aware-mpi` option. CPU-to-GPU data transfers can be accelerated (via pinning of host-side allocations)
by building with the `--with-cuda` option.

The ecwam-bundle also provides appropriate arch files for the nvhpc suite on the ECMWF ATOS system.

Running
-------
No extra run-time options are needed to run the GPU-enabled ecWAM. Note that this means that if ecWAM is built
using the `--with-loki` and `--with-acc` bundle arguments, it will necessarily be offloaded for GPU execution.
For multi-GPU runs, the number of GPUs maps to the number of MPI ranks. Thus multiple GPUs can be requested by
launching with multiple MPI ranks. The mapping of MPI ranks to GPUs assumes at most 4 GPUs per host node.

Environment variables
---------------------
Expand Down
2 changes: 1 addition & 1 deletion cmake/ecwam_expand_drv_types.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ macro( ecwam_expand_drv_types )
list(APPEND FYPP_ARGS -DPARKIND1_SINGLE_NEMO)
endif()

if( HAVE_ACC )
if( HAVE_LOKI AND NOT LOKI_MODE MATCHES "idem|idem-stack" )
list(APPEND FYPP_ARGS -DWAM_GPU)
endif()

Expand Down
5 changes: 5 additions & 0 deletions package/bundle/bundle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ options :
cmake : >
ENABLE_ACC=ON

- with-cuda :
help : Enable FIELD_API CUDA backend
cmake : >
ENABLE_CUDA=ON

- without-loki-install :
help : Skip installation of Loki (Requires Loki to be on the PATH)
cmake : >
Expand Down
8 changes: 6 additions & 2 deletions src/ecwam/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ list( APPEND ecwam_srcs
w_mode_st.F90
w_pdf.F90
w_pmax.F90
wam_init_gpu_mod.F90
wam_multio_mod.F90
wam_nproma.F90
wam_sorti.F90
Expand Down Expand Up @@ -363,6 +362,7 @@ if(HAVE_LOKI AND NOT LOKI_MODE MATCHES "idem|idem-stack")
list(APPEND ecwam_srcs wamintgr_loki_gpu.F90)
list(REMOVE_ITEM ecwam_srcs wamintgr.F90)
list(APPEND ecwam_srcs cireduce_loki_gpu.F90)
list(APPEND ecwam_srcs outbs_loki_gpu.F90)
endif()

# expand derived-types using src/ecwam/yowdrvtype_config.yml
Expand Down Expand Up @@ -445,7 +445,7 @@ ecbuild_add_library(
$<${HAVE_ACC}:OpenACC::OpenACC_Fortran>
PUBLIC_INCLUDES $<INSTALL_INTERFACE:include>
PRIVATE_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}
PRIVATE_DEFINITIONS ${ECWAM_PRIVATE_DEFINITIONS}
PRIVATE_DEFINITIONS ${ECWAM_PRIVATE_DEFINITIONS} $<${HAVE_CUDA}:_CUDA>
PUBLIC_DEFINITIONS ${ECWAM_DEFINITIONS}
)

Expand All @@ -460,6 +460,10 @@ if( HAVE_ACC AND CMAKE_Fortran_COMPILER_ID MATCHES "PGI|NVHPC")
target_compile_options( ${ecwam} PRIVATE "-gpu=maxregcount:128" )
endif()

# When the CUDA feature is on, add -cuda and -gpu=pinned to the link line of
# ${ecwam}; PUBLIC also propagates them to targets that link against it.
# NOTE(review): these look like NVHPC-specific flags, yet this guard does not
# test CMAKE_Fortran_COMPILER_ID the way the maxregcount option does — confirm
# that HAVE_CUDA can only be true with a PGI/NVHPC toolchain.
if( HAVE_CUDA )
target_link_options( ${ecwam} PUBLIC "-cuda;-gpu=pinned" )
endif()

ecwam_target_compile_definitions_FILENAME( ${ecwam} )

### The file mubuf.F90, which is only used for "preproc" is sensitive to optimisations
Expand Down
1 change: 1 addition & 0 deletions src/ecwam/aki.F90
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ REAL(KIND=JWRB) FUNCTION AKI(OM,BETA)
! ----------------------------------------------------------------------

IMPLICIT NONE
!$loki routine seq

REAL(KIND=JWRB), INTENT(IN) :: OM, BETA

Expand Down
26 changes: 13 additions & 13 deletions src/ecwam/cal_second_order_spec.F90
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ SUBROUTINE CAL_SECOND_ORDER_SPEC(KIJS, KIJL, F1, WAVNUM, DEPTH, SIG)
USE YOWPARAM, ONLY : NANG, NFRE
USE YOWPCONS, ONLY : G, PI, ZPI
USE YOWSHAL , ONLY : NDEPTH, DEPTHA, DEPTHD
USE YOWTABL , ONLY : MR, XMR, MA, XMA, NFREH, NANGH, NMAX, &
& OMEGA, DFDTH, THH, DELTHH, IM_P, IM_M, &
USE YOWTABL , ONLY : MR, XMR, MA, XMA, NFREH, NANGH, &
& OMEGA, DFDTH, THH, DELTHH, IM_P, IM_M, &
& TA, TB, TC_QL, TT_4M, TT_4P
USE YOWTEST , ONLY : IU06

Expand All @@ -66,20 +66,20 @@ SUBROUTINE CAL_SECOND_ORDER_SPEC(KIJS, KIJL, F1, WAVNUM, DEPTH, SIG)
#include "secspom.intfb.h"

INTEGER(KIND=JWIM), INTENT(IN) :: KIJS, KIJL
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NANG,NFRE), INTENT(INOUT) :: F1
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NFRE), INTENT(IN) :: WAVNUM
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL), INTENT(IN) :: DEPTH
REAL(KIND=JWRB), DIMENSION(KIJL,NANG,NFRE), INTENT(INOUT) :: F1
REAL(KIND=JWRB), DIMENSION(KIJL,NFRE), INTENT(IN) :: WAVNUM
REAL(KIND=JWRB), DIMENSION(KIJL), INTENT(IN) :: DEPTH
REAL(KIND=JWRB), INTENT(IN) :: SIG

INTEGER(KIND=JWIM) :: IJ,M,K,K0,M0,MP,KP,MM,KM,KL,KLL,ML

REAL(KIND=JWRB) :: FRAC,CO1,DEL,DELF,D1,D2,D3,D4,C1
REAL(KIND=JWRB) :: C2,XM,XK,OMSTART,AREA,SUM,SUM1,SUM3,GAM_B_J,ZFAC
REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL) :: EMEAN, FMEAN
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL) :: F1MEAN, AKMEAN, XKMEAN, EMAXL
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NANG,NFRE) :: F3
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NANGH,NFREH) :: PF1, PF3
REAL(KIND=JWRB), DIMENSION(KIJL) :: EMEAN, FMEAN
REAL(KIND=JWRB), DIMENSION(KIJL) :: F1MEAN, AKMEAN, XKMEAN, EMAXL
REAL(KIND=JWRB), DIMENSION(KIJL,NANG,NFRE) :: F3
REAL(KIND=JWRB), DIMENSION(KIJL,NANGH,NFREH) :: PF1, PF3

!-----------------------------------------------------------------------

Expand All @@ -104,8 +104,8 @@ SUBROUTINE CAL_SECOND_ORDER_SPEC(KIJS, KIJL, F1, WAVNUM, DEPTH, SIG)
!*** 1.11 NO INTERPOLATION.
! ----------------------

CALL SECSPOM(F1,F3,KIJS,KIJL,NFRE,NANG,NMAX,NDEPTH,DEPTHA, &
& DEPTHD,OMSTART,FRAC,MR,DFDTH,OMEGA,DEPTH, &
CALL SECSPOM(F1,F3,KIJS,KIJL,NFRE,NANG,NDEPTH,DEPTHA, &
& DEPTHD,OMSTART,FRAC,MR,DFDTH,OMEGA,DEPTH, &
& AKMEAN,TA,TB,TC_QL,TT_4M,TT_4P,IM_P,IM_M)
DO M=1,NFRE
DO K=1,NANG
Expand Down Expand Up @@ -138,8 +138,8 @@ SUBROUTINE CAL_SECOND_ORDER_SPEC(KIJS, KIJL, F1, WAVNUM, DEPTH, SIG)
!*** 1.13 DETERMINE SECOND-ORDER SPEC
! --------------------------------

CALL SECSPOM(PF1,PF3,KIJS,KIJL,NFREH,NANGH,NMAX,NDEPTH,DEPTHA, &
& DEPTHD,OMSTART,FRAC,MR,DFDTH,OMEGA,DEPTH, &
CALL SECSPOM(PF1,PF3,KIJS,KIJL,NFREH,NANGH,NDEPTH,DEPTHA, &
& DEPTHD,OMSTART,FRAC,MR,DFDTH,OMEGA,DEPTH, &
& AKMEAN,TA,TB,TC_QL,TT_4M,TT_4P,IM_P,IM_M)

!*** 2.24 INTERPOLATE TOWARDS HIGH-RES GRID
Expand Down
6 changes: 4 additions & 2 deletions src/ecwam/cireduce_loki_gpu.F90
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,14 @@ SUBROUTINE CIREDUCE_LOKI_GPU (WVPRPT, FF_NOW)
ENDIF
CALL GSTATS(1493,0)
! DETERMINE THE WAVE ATTENUATION FACTOR
!$acc parallel loop gang present(FF_NOW, WVPRPT) vector_length(NPROMA_WAM)
!$acc data present(FF_NOW, WVPRPT)

DO ICHNK = 1, NCHNK
CALL CIWAF(1, NPROMA_WAM, WVPRPT%CGROUP(:,:,ICHNK), FF_NOW%CICOVER(:,ICHNK), &
& FF_NOW%CITHICK(:,ICHNK), WVPRPT%CIWA(:,:,ICHNK))
ENDDO
!$acc end parallel loop

!$acc end data
CALL GSTATS(1493,1)
ENDIF

Expand Down
10 changes: 5 additions & 5 deletions src/ecwam/ctcor.F90
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,16 @@ SUBROUTINE CTCOR (KIJS, KIJL, F, CTR)
IMPLICIT NONE

INTEGER(KIND=JWIM), INTENT(IN) :: KIJS, KIJL
REAL(KIND=JWRB), INTENT(IN) :: F(KIJS:KIJL,NANG,NFRE)
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL), INTENT(OUT) :: CTR
REAL(KIND=JWRB), INTENT(IN) :: F(KIJL,NANG,NFRE)
REAL(KIND=JWRB), DIMENSION(KIJL), INTENT(OUT) :: CTR


INTEGER(KIND=JWIM) :: IJ, K, M
REAL(KIND=JWRB) :: FR1M1, ZARG, ZAMP
REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL) :: EM, ZT1
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL) :: ZRHO, ZLAM
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL, NFRE) :: TEMP
REAL(KIND=JWRB), DIMENSION(KIJL) :: EM, ZT1
REAL(KIND=JWRB), DIMENSION(KIJL) :: ZRHO, ZLAM
REAL(KIND=JWRB), DIMENSION(KIJL, NFRE) :: TEMP

! ----------------------------------------------------------------------

Expand Down
8 changes: 4 additions & 4 deletions src/ecwam/dominant_period.F90
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ SUBROUTINE DOMINANT_PERIOD (KIJS, KIJL, FL1, DP)


INTEGER(KIND=JWIM), INTENT(IN) :: KIJS, KIJL
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NANG,NFRE), INTENT(IN) :: FL1
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL), INTENT(OUT) :: DP
REAL(KIND=JWRB), DIMENSION(KIJL,NANG,NFRE), INTENT(IN) :: FL1
REAL(KIND=JWRB), DIMENSION(KIJL), INTENT(OUT) :: DP

REAL(KIND=JWRB), PARAMETER :: FLTHRS = 0.1_JWRB

INTEGER(KIND=JWIM) :: IJ, K, M
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL) :: TEMP, EM, FCROP
REAL(KIND=JWRB), DIMENSION(KIJS:KIJL,NFRE) :: F1D4
REAL(KIND=JWRB), DIMENSION(KIJL) :: TEMP, EM, FCROP
REAL(KIND=JWRB), DIMENSION(KIJL,NFRE) :: F1D4

REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

Expand Down
Loading
Loading