Make the in-place FFT optional in the GPU transforms.

The CMake part defines IN_PLACE_FFT for the ectrans_gpu_<prec> targets only when
HAVE_CUDA is set (per the comment in the patch: cuFFT can do in-place FFT,
hipFFT cannot). When IN_PLACE_FFT is not defined, PREPARE_FTDIR / PREPARE_FTINV
reserve a separate buffer through the buffered allocator and FTDIR / FTINV
point their output array at that reservation instead of aliasing the input.

Review note: the byte counts handed to RESERVE and ASSIGN_PTR are now computed
as 1_C_SIZE_T*<fields>*D%NLENGTF*SIZEOF(...), so the very first multiplication
is already carried out in C_SIZE_T. Previously the field count (a JPIM integer)
was multiplied by D%NLENGTF before any widening, and the outer
INT(..., KIND=C_SIZE_T) converted the product only after it could already have
wrapped. Only the content of existing "+" lines changed, so hunk line counts
are untouched; the post-image blob ids on the "index" lines of ftdir_mod.F90
and ftinv_mod.F90 are therefore stale.

diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt
index a9b16ba8..3fcdfb9c 100644
--- a/src/trans/gpu/CMakeLists.txt
+++ b/src/trans/gpu/CMakeLists.txt
@@ -27,6 +27,7 @@ if( HAVE_HIP )
     algor/hicblas_gemm.hip.cpp
     algor/hicfft.hip.cpp
   )
+  ecbuild_info("warn: IN_PLACE_FFT not defined for hipFFT")
 elseif( HAVE_CUDA )
   set( GPU_RUNTIME "CUDA" )
   set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas nvhpcwrapnvtx CUDA::cudart )
@@ -34,6 +35,7 @@ elseif( HAVE_CUDA )
     algor/hicblas_gemm.cuda.cu
     algor/hicfft.cuda.cu
   )
+  ecbuild_info("warn: IN_PLACE_FFT defined for cuFFT")
 else()
   ecbuild_info("warn: HIP and CUDA not found")
 endif()
@@ -166,6 +168,11 @@ foreach( prec dp sp )
     target_compile_definitions( ectrans_gpu_${prec} PRIVATE TRANS_SINGLE PARKINDTRANS_SINGLE )
   endif()
 
+  # cuFFT can do in-place FFT, hipFFT cannot
+  if( HAVE_CUDA )
+    target_compile_definitions( ectrans_gpu_${prec} PRIVATE IN_PLACE_FFT )
+  endif()
+
   if( HAVE_OMP AND CMAKE_Fortran_COMPILER_ID MATCHES Cray )
     # Propagate flags as link options for downstream targets. Only required for Cray
     target_link_options( ectrans_gpu_${prec} INTERFACE
diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90
index af873d76..95425107 100755
--- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90
+++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90
@@ -146,7 +146,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,&
   ALLOCATOR = MAKE_BUFFERED_ALLOCATOR()
   HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS)
   IF (KF_FS > 0) THEN
-    HFTDIR = PREPARE_FTDIR()
+    HFTDIR = PREPARE_FTDIR(ALLOCATOR,KF_FS)
     HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS)
     HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS)
     HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS)
diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90
index 65e50138..e8801ac9 100755
--- a/src/trans/gpu/internal/ftdir_mod.F90
+++ b/src/trans/gpu/internal/ftdir_mod.F90
@@ -10,19 +10,35 @@
 !
 
 MODULE FTDIR_MOD
+  USE BUFFERED_ALLOCATOR_MOD ,ONLY : ALLOCATION_RESERVATION_HANDLE
   IMPLICIT NONE
 
   PRIVATE
   PUBLIC :: FTDIR, FTDIR_HANDLE, PREPARE_FTDIR
 
   TYPE FTDIR_HANDLE
+    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_COMPLEX
   END TYPE
 
 CONTAINS
 
-  FUNCTION PREPARE_FTDIR() RESULT(HFTDIR)
+  FUNCTION PREPARE_FTDIR(ALLOCATOR,KF_FS) RESULT(HFTDIR)
+    USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
+    USE TPM_DISTR, ONLY: D
+    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
+    USE ISO_C_BINDING, ONLY: C_SIZE_T
+    IMPLICIT NONE
+
+    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
+    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
     TYPE(FTDIR_HANDLE) :: HFTDIR
-  END FUNCTION
+
+    REAL(KIND=JPRBT) :: DUMMY
+
+#ifndef IN_PLACE_FFT
+    HFTDIR%HREEL_COMPLEX = RESERVE(ALLOCATOR, INT(1_C_SIZE_T*KF_FS*D%NLENGTF*SIZEOF(DUMMY), KIND=C_SIZE_T))
+#endif
+  END FUNCTION PREPARE_FTDIR
 
   SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)
     !**** *FTDIR - Direct Fourier transform
@@ -60,12 +76,13 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)
     USE TPM_GEN, ONLY: LSYNC_TRANS
     USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
     USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, &
-      & D_NPNTGTB0, D_NPROCM, D_NDGL_FS
+      & D_NPNTGTB0, D_NPROCM, D_NDGL_FS, D
     USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN
-    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR
+    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
     USE TPM_HICFFT, ONLY: EXECUTE_DIR_FFT
     USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
     USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
+    USE ISO_C_BINDING, ONLY: C_SIZE_T
 
     IMPLICIT NONE
 
@@ -77,7 +94,12 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)
 
     INTEGER(KIND=JPIM) :: KGL
 
+#ifdef IN_PLACE_FFT
     PREEL_COMPLEX => PREEL_REAL
+#else
+    CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HFTDIR%HREEL_COMPLEX),&
+      & 1_C_SIZE_T, int(1_C_SIZE_T*KFIELD*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1)),kind=c_size_t))
+#endif
 
 #ifdef ACCGPU
     !$ACC DATA PRESENT(PREEL_REAL, PREEL_COMPLEX, &
diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90
index e939f42f..bf59b384 100755
--- a/src/trans/gpu/internal/ftinv_mod.F90
+++ b/src/trans/gpu/internal/ftinv_mod.F90
@@ -10,20 +10,33 @@
 !
 
 MODULE FTINV_MOD
-  USE BUFFERED_ALLOCATOR_MOD ,ONLY : BUFFERED_ALLOCATOR
+  USE BUFFERED_ALLOCATOR_MOD ,ONLY : BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE
   IMPLICIT NONE
 
   PRIVATE
   PUBLIC :: FTINV, FTINV_HANDLE, PREPARE_FTINV
 
   TYPE FTINV_HANDLE
+    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_REAL
   END TYPE
 
 CONTAINS
 
-  FUNCTION PREPARE_FTINV(ALLOCATOR) RESULT(HFTINV)
+  FUNCTION PREPARE_FTINV(ALLOCATOR,KF_FS) RESULT(HFTINV)
+    USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
+    USE TPM_DISTR, ONLY: D
+    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
+    USE ISO_C_BINDING, ONLY: C_SIZE_T
+    IMPLICIT NONE
     TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
+    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
     TYPE(FTINV_HANDLE) :: HFTINV
+
+    REAL(KIND=JPRBT) :: DUMMY
+
+#ifndef IN_PLACE_FFT
+    HFTINV%HREEL_REAL = RESERVE(ALLOCATOR, int(1_C_SIZE_T*D%NLENGTF*KF_FS*SIZEOF(DUMMY),kind=c_size_t))
+#endif
   END FUNCTION
 
   SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)
@@ -59,13 +72,15 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)
     !        G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW
     !     ------------------------------------------------------------------
 
-    USE TPM_GEN, ONLY: LSYNC_TRANS
-    USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
-    USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF
-    USE TPM_GEOMETRY, ONLY: G_NLOEN
-    USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT
-    USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
-    USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
+    USE TPM_GEN,                ONLY: LSYNC_TRANS
+    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
+    USE TPM_DISTR,              ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF, D
+    USE TPM_GEOMETRY,           ONLY: G_NLOEN
+    USE TPM_HICFFT,             ONLY: EXECUTE_INV_FFT
+    USE MPL_MODULE,             ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
+    USE TPM_STATS,              ONLY: GSTATS => GSTATS_NVTX
+    USE BUFFERED_ALLOCATOR_MOD, ONLY: ASSIGN_PTR, GET_ALLOCATION
+    USE ISO_C_BINDING,          ONLY: C_SIZE_T
 
     IMPLICIT NONE
 
@@ -77,7 +92,12 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)
 
     INTEGER(KIND=JPIM) :: KGL
 
+#ifdef IN_PLACE_FFT
     PREEL_REAL => PREEL_COMPLEX
+#else
+    CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HFTINV%HREEL_REAL),&
+      & 1_C_SIZE_T, int(1_C_SIZE_T*KFIELD*D%NLENGTF*SIZEOF(PREEL_REAL(1)),kind=c_size_t))
+#endif
 
 #ifdef OMPGPU
 #endif
diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90
index c928feb7..804e4710 100644
--- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90
+++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90
@@ -202,7 +202,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,&
     HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG)
     HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER)
     HFSC = PREPARE_FSC(ALLOCATOR)
-    HFTINV = PREPARE_FTINV(ALLOCATOR)
+    HFTINV = PREPARE_FTINV(ALLOCATOR,IF_FOURIER)
   ENDIF
 
   HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP)