Skip to content

Commit

Permalink
WIP: Add CUDA version to HIP branch
Browse files Browse the repository at this point in the history
  • Loading branch information
reuterbal committed Aug 11, 2023
1 parent e018588 commit 498d0e8
Show file tree
Hide file tree
Showing 27 changed files with 1,769 additions and 167 deletions.
49 changes: 43 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ ecbuild_find_package( NAME fiat REQUIRED )

ecbuild_add_option( FEATURE MPI
DESCRIPTION "Support for MPI distributed memory parallelism"
REQUIRED_PACKAGES "MPI COMPONENTS Fortran C CXX"
CONDITION fiat_HAVE_MPI )

ecbuild_add_option( FEATURE OMP
Expand All @@ -35,11 +36,11 @@ ecbuild_add_option( FEATURE OMP
REQUIRED_PACKAGES "OpenMP COMPONENTS Fortran" )

ecbuild_add_option( FEATURE DOUBLE_PRECISION
DEFAULT ON
DEFAULT ON
DESCRIPTION "Support for Double Precision" )

ecbuild_add_option( FEATURE SINGLE_PRECISION
DEFAULT ON
DEFAULT ON
DESCRIPTION "Support for Single Precision" )

if( HAVE_SINGLE_PRECISION )
Expand Down Expand Up @@ -74,18 +75,46 @@ ecbuild_add_option( FEATURE TRANSI
CONDITION HAVE_DOUBLE_PRECISION AND HAVE_CPU )

set( HIP_REQUESTED OFF )
set( CUDA_REQUESTED OFF )
if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU))
set( HIP_REQUESTED ON )
# TODO: allow to switch between CUDA and HIP or automatically detect target platform
set( CUDA_REQUESTED ON )
# set( HIP_REQUESTED ON )
endif()

if( HIP_REQUESTED )
ectrans_find_hip() # sets "HAVE_HIP"
endif()
endif()

if( CUDA_REQUESTED )
enable_language( CUDA )
set( HAVE_CUDA ON )
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 70 80)
endif()
find_package( CUDAToolkit )
endif()

ecbuild_add_option( FEATURE GPU
DEFAULT OFF
DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support and MPI)"
CONDITION HAVE_HIP )
CONDITION HAVE_HIP OR HAVE_CUDA )

if( ${CMAKE_VERSION} VERSION_LESS "3.25" AND (NOT DEFINED ENABLE_ACC OR ENABLE_ACC ) )
# Incredibly inconvenient: FindOpenACC does _not_ set OpenACC_FOUND, only
# the language-specific components OpenACC_Fortran_FOUND and OpenACC_C_FOUND.
# This means, even internally CMake considers OpenACC as not found.
# (See eg get_property(... GLOBAL PROPERTY PACKAGES_NOT_FOUND))
# Therefore, we search for OpenACC, set OpenACC_FOUND ourselves according to
# the result, and then, trigger a second find_package via ecbuild_add_option.
# This then conveniently takes the previously set OpenACC_FOUND into account
# and rectifies CMake's internal bookkeeping in the process.
# This has been fixed in CMake 3.25
find_package( OpenACC )
if( OpenACC_Fortran_FOUND )
set( OpenACC_FOUND ON )
endif()
endif()

ecbuild_add_option( FEATURE ACC
DEFAULT ON
Expand All @@ -100,10 +129,18 @@ if( HAVE_GPU )
else()
ecbuild_error("Could not enable GPU as OMP or ACC were not enabled")
endif()

if( HAVE_CUDA )
set( GPU_RUNTIME "CUDA" )
elseif( HAVE_HIP )
set( GPU_RUNTIME "HIP" )
else()
ecbuild_error("Could not enable GPU as CUDA or HIP were not found")
endif()
endif()

ecbuild_add_option( FEATURE GPU_AWARE_MPI
DEFAULT OFF
DEFAULT OFF
CONDITION HAVE_GPU
REQUIRED_PACKAGES "MPI COMPONENTS CXX Fortran"
DESCRIPTION "Enable CUDA-aware MPI" )
Expand Down
6 changes: 5 additions & 1 deletion src/programs/ectrans-benchmark.F90
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ program transform_test
! Default parameters
integer(kind=jpim) :: nsmax = 79 ! Spectral truncation
integer(kind=jpim) :: iters = 10 ! Number of iterations for transform test
integer(kind=jpim) :: nfld = 1 ! Number of scalar fields
integer(kind=jpim) :: nfld = 1 ! Number of scalar fields
integer(kind=jpim) :: nlev = 1 ! Number of vertical levels

integer(kind=jpim) :: nflevg
Expand Down Expand Up @@ -608,6 +608,8 @@ program transform_test
! Do inverse transform
!=================================================================================================

write(nout, *) 'inv_trans'
flush(nout)
ztstep1(jstep) = timef()
call gstats(4,0)
if (lvordiv) then
Expand Down Expand Up @@ -658,6 +660,8 @@ program transform_test

ztstep2(jstep) = timef()

write(nout, *) 'dir_trans'
flush(nout)
call gstats(5,0)
if (lvordiv) then
call dir_trans(kresol=1, kproma=nproma, &
Expand Down
34 changes: 26 additions & 8 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

if(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC")

# Compile setup_trans with pinned memory to improve data movement performance.
# Compile setup_trans with pinned memory to improve data movement performance.
ectrans_add_compile_options(
SOURCES external/setup_trans.F90
#FLAGS "-gpu=pinned,deepcopy,fastmath,nordc")
Expand All @@ -30,6 +30,8 @@ ecbuild_list_add_pattern( LIST trans_src
QUIET
)

ecbuild_info( "${trans_src}")

## for reduced memory option, replace source files
if( HAVE_GPU_REDUCED_MEMORY )
ecbuild_list_add_pattern( LIST reducedmem_files
Expand All @@ -50,12 +52,26 @@ ecbuild_list_exclude_pattern( LIST trans_src REGEX dilatation_mod.F90
ecbuild_info("warn: special compile flags ftdir_mod.F90")
#endif()

ectrans_declare_hip_sources( SOURCES_GLOB
sharedmem/*.hip.cpp
algor/*.hip.cpp
internal/*.hip.cpp
external/*.hip.cpp
)
if( HIP_FOUND )
ectrans_declare_hip_sources( SOURCES_GLOB
sharedmem/*.hip.cpp
algor/*.hip.cpp
internal/*.hip.cpp
external/*.hip.cpp
)
ecbuild_list_exclude_pattern( LIST trans_src REGEX .cu )
ecbuild_list_exclude_pattern( LIST trans_src REGEX cublas_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX cuda_device_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX cuda_gemm_batched_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX tpm_fftc.F90 )
else()
set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas )
ecbuild_list_exclude_pattern( LIST trans_src REGEX .hip.cpp )
ecbuild_list_exclude_pattern( LIST trans_src REGEX hipblas_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX hip_device_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX hip_gemm_batched_mod.F90 )
ecbuild_list_exclude_pattern( LIST trans_src REGEX tpm_ffth.F90 )
endif()

foreach( prec dp sp )
if( HAVE_${prec} )
Expand All @@ -79,9 +95,11 @@ foreach( prec dp sp )
${LAPACK_LIBRARIES} # we still have symbols in some files
$<${HAVE_ACC}:OpenACC::OpenACC_Fortran>
$<${HAVE_OMP}:OpenMP::OpenMP_Fortran>
$<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX>
# $<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX>
MPI::MPI_Fortran MPI::MPI_CXX
PRIVATE_DEFINITIONS
${GPU_OFFLOAD}GPU
${GPU_RUNTIME}GPU
$<${HAVE_GPU_AWARE_MPI}:USE_CUDA_AWARE_MPI_FT>
$<${HAVE_GPU_REDUCED_MEMORY}:REDUCED_MEM>
)
Expand Down
167 changes: 167 additions & 0 deletions src/trans/gpu/algor/external/fourier/create_plan_fftc.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
#include "cufft.h"
#include "stdio.h"
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";

case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";

case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";

case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";

case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";

case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";

case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";

case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";

case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";

case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}

return "<unknown>";
}

inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error at 1\n");
fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__);
fprintf(stderr, "CUFFT error at 2\n");
/*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/
fprintf(stderr, "CUFFT error at 3\n");
/*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \*/
fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \
cudaDeviceReset(); return; \
}
}


static int allocatedWorkspace=0;
static void* planWorkspace;
static int planWorkspaceSize=100*1024*1024; //100MB

extern "C"
void
create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp)
{
int ISIGN = *ISIGNp;
int N = *Np;
int LOT = *LOTp;

cufftHandle plan;

if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}


// //create a single re-usable workspace
// if(!allocatedWorkspace){
// allocatedWorkspace=1;
// //allocate plan workspace
// cudaMalloc(&planWorkspace,planWorkspaceSize);
// }
//
// //disable auto allocation so we can re-use a single workspace (created above)
// cufftSetAutoAllocation(plan, false);

int embed[1];
int stride;
int dist;

#ifdef TRANS_SINGLE
cufftType cufft_1 = CUFFT_R2C;
cufftType cufft_2 = CUFFT_C2R;
#else
cufftType cufft_1 = CUFFT_D2Z;
cufftType cufft_2 = CUFFT_Z2D;
#endif

embed[0] = 1;
stride = LOT;
dist = 1;

cufftSafeCall(cufftCreate(&plan));

//printf("CreatePlan cuFFT\n","N=",N);
//printf("%s %d \n","plan=",plan);
//printf("%s %d \n","LOT=",LOT);
//printf("%s %d \n","ISIGN=",ISIGN);
//printf("%s %d \n","Np=",*Np);

if( ISIGN== -1 ){
cufftSafeCall(cufftPlanMany(&plan, 1, &N,
embed, stride, dist,
embed, stride, dist,
cufft_1, LOT));
//cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT));
}
else if( ISIGN== 1){
cufftSafeCall(cufftPlanMany(&plan, 1, &N,
embed, stride, dist,
embed, stride, dist,
cufft_2, LOT));
//cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT));
}
else {
abort();
}

// // use our reusaable work area for the plan
// cufftSetWorkArea(plan,planWorkspace);

/*
if( ISIGN== -1 ){
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT));
}
else if( ISIGN== 1){
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT));
}
else {
abort();
}
*/

if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}

*PLANp=plan;

// // get size used by this plan
// size_t workSize;
// cufftGetSize(plan,&workSize);
//
// // exit if we don't have enough space for the work area in the re-usable workspace
// if(workSize > planWorkspaceSize){
// printf("create_plan_fftc: plan workspace size not large enough - exiting\n");
// exit(1);
// }


return;


}

Loading

0 comments on commit 498d0e8

Please sign in to comment.