From 4af9e522518e08226725bc456b2b50fa45dcba4b Mon Sep 17 00:00:00 2001 From: Suyash Tandon Date: Wed, 19 Oct 2022 21:12:23 -0500 Subject: [PATCH] modified: 1. cmake config for different build types - release, build, etc. 2. `src/acc/hip/hip_kernels` to fix errors with 3D `relion_refine` runs 3. GPU memory management in `src/ml_optimiser_mpi.cpp` corrected to match CUDA impl. --- CMakeLists.txt | 11 +- cmake/BuildTypes.cmake | 41 +++--- src/acc/hip/hip_kernels/diff2.h | 140 ++++++++++----------- src/acc/hip/hip_kernels/helper.h | 103 ++++++++------- src/acc/hip/hip_kernels/hip_device_utils.h | 4 +- src/acc/hip/hip_kernels/wavg.h | 2 +- src/ml_optimiser_mpi.cpp | 20 ++- tests/CMakeLists.txt | 2 +- 8 files changed, 164 insertions(+), 159 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 205e04ad6..4768acbe9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ endif() # ----------------------------------------------------------------SET CXX STANDARD-- set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD 14) -set(CXX_EXTENSIONS OFF) +set(CMAKE_CXX_EXTENSIONS OFF) # ------------------OPTIONS WHICH ARE NEEDED TO SET BUILD-TYPES (COMPILATION FLAGS)-- # ----------------------------------------------------------CUDA-ARCH--OR--HIP-ARCH-- @@ -155,10 +155,6 @@ if(MDT_TYPE_CHECK) add_definitions(-DMETADATA_TABLE_TYPE_CHECK) endif() -# ----------------------------------------------------------INCLUDE ALL BUILD TYPES-- - #This *has* to be AFTER project() -include(${CMAKE_SOURCE_DIR}/cmake/BuildTypes.cmake) - if(CUDA) # -----------------------------------------------------------------------------CUDA-- # DOC: http://www.cmake.org/cmake/help/v3.0/module/FindCUDA.html @@ -213,6 +209,11 @@ else(HIP) message(FATAL_ERROR "HIP enabled but unlable to locate packages. ROCm >= 5.0 is required to configure RELION with HIP using CMake.") endif(HIP_FOUND) endif() + +# ----------------------------------------------------------INCLUDE ALL BUILD TYPES-- + #This *has* to be AFTER project() + include(${CMAKE_SOURCE_DIR}/cmake/BuildTypes.cmake) + # ------------------------------------------------------------------ALLOCATOR CHOICE-- option(CachedAlloc "CachedAlloc" ON) if(NOT CachedAlloc) diff --git a/cmake/BuildTypes.cmake b/cmake/BuildTypes.cmake index 69fc4cbe3..c50c3eea9 100644 --- a/cmake/BuildTypes.cmake +++ b/cmake/BuildTypes.cmake @@ -7,11 +7,15 @@ if(CUDA) set(EXTRA_NVCC_FLAGS "-D__INTEL_COMPILER --default-stream per-thread --std=c++11") set(RELION_NVCC_FLAGS "${CUDARCH} ${WARN_DBL} ${EXTRA_NVCC_FLAGS}" CACHE STRING "" FORCE) else (HIP) - set(EXTRA_HIPCC_FLAGS "-fno-gpu-rdc -munsafe-fp-atomics -fgpu-default-stream=legacy") + if (${HIP_VERSION} VERSION_LESS "5.3" ) + set(EXTRA_HIPCC_FLAGS "-fgpu-default-stream=legacy -fno-gpu-rdc -munsafe-fp-atomics") + else() + set(EXTRA_HIPCC_FLAGS "-fno-gpu-rdc -munsafe-fp-atomics -fgpu-default-stream=per-thread") + endif() set(RELION_HIPCC_FLAGS "${EXTRA_HIPCC_FLAGS}" CACHE STRING "Compiler flags for HIP" FORCE) endif() -#message(STATUS "RELION_NVCC_FLAGS: ${RELION_NVCC_FLAGS}") -#message(STATUS "RELION_HIPCC_FLAGS: ${RELION_HIPCC_FLAGS}") +# message(STATUS "RELION_NVCC_FLAGS: ${RELION_NVCC_FLAGS}") +# message(STATUS "RELION_HIPCC_FLAGS: ${RELION_HIPCC_FLAGS}") # -------------------------- # Debug BUILD # -------------------------- @@ -23,7 +27,7 @@ endif() # it implies --ptxas-options=--verbose. # -- Compiler flags ------------------------------------------------- -set(RELION_FLAGS_DEBUG "-O0" CACHE STRING "") +set(RELION_FLAGS_DEBUG "-O0 -DDEBUG" CACHE STRING "") if(CUDA) set(RELION_NVCC_FLAGS_DEBUG "${RELION_NVCC_FLAGS}" CACHE STRING "") else(HIP) @@ -53,9 +57,9 @@ else(HIP) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${RELION_DEFINITIONS_DEBUG}") endif() -#message(STATUS "Set the extra flags for Debug build type") -#message(STATUS "RELION_NVCC_FLAGS_DEBUG : ${RELION_NVCC_FLAGS_DEBUG}") -#message(STATUS "CUDA_NVCC_FLAGS_DEBUG : ${CUDA_NVCC_FLAGS_DEBUG}") +# message(STATUS "Set the extra flags for Debug build type") +# message(STATUS "RELION_NVCC_FLAGS_DEBUG : ${RELION_NVCC_FLAGS_DEBUG}") +# message(STATUS "CUDA_NVCC_FLAGS_DEBUG : ${CUDA_NVCC_FLAGS_DEBUG}") # message(STATUS "CMAKE_CXX_FLAGS_DEBUG : ${CMAKE_CXX_FLAGS_DEBUG}") #-------------------------------------------------------------------- @@ -65,7 +69,7 @@ endif() # -------------------------- # RELWITHDEBINFO BUILD # -------------------------- -set(RELION_FLAGS_RELWITHDEBINFO "-O2" CACHE STRING "") +set(RELION_FLAGS_RELWITHDEBINFO "-O2 -DDEBUG" CACHE STRING "") # -- Compiler flags ------------------------------------------------- if(CUDA) set(RELION_NVCC_FLAGS_RELWITHDEBINFO "${RELION_NVCC_FLAGS}" CACHE STRING "") @@ -86,16 +90,16 @@ if(CUDA) set(RELION_DEFINITIONS_RELWITHDEBINFO "-DDEBUG_CUDA") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${RELION_DEFINITIONS_RELWITHDEBINFO}") else(HIP) - set(CMAKE_CXX__FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${RELION_HIPCC_FLAGS_RELWITHDEBINFO}") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${RELION_HIPCC_FLAGS_RELWITHDEBINFO}") # -- Add preprocessor defintions ------------------------------------ set(RELION_DEFINITIONS_RELWITHDEBINFO "-DDEBUG_HIP") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${RELION_DEFINITIONS_RELWITHDEBINFO}") endif() -#message(STATUS "Set the extra flags for RELWITHDEBINFO build type") -#message(STATUS "RELION_NVCC_FLAGS_RELWITHDEBINFO : ${RELION_NVCC_FLAGS_RELWITHDEBINFO}") -#message(STATUS "CUDA_NVCC_FLAGS_RELWITHDEBINFO : ${CUDA_NVCC_FLAGS_RELWITHDEBINFO}") +# message(STATUS "Set the extra flags for RELWITHDEBINFO build type") +# message(STATUS "RELION_NVCC_FLAGS_RELWITHDEBINFO : ${RELION_NVCC_FLAGS_RELWITHDEBINFO}") +# message(STATUS "CUDA_NVCC_FLAGS_RELWITHDEBINFO : ${CUDA_NVCC_FLAGS_RELWITHDEBINFO}") # message(STATUS "CMAKE_CXX_FLAGS_RELWITHDEBINFO : ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") #-------------------------------------------------------------------- @@ -141,7 +145,7 @@ endif() set(RELION_DEFINITIONS_RELEASE "") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${RELION_DEFINITIONS_RELEASE}") -#message(STATUS "RELION_FLAGS_PROFILING : ${RELION_FLAGS_PROFILING}") +# message(STATUS "RELION_FLAGS_PROFILING : ${RELION_FLAGS_PROFILING}") # message(STATUS "CMAKE_CXX_FLAGS_RELEASE : ${CMAKE_CXX_FLAGS_RELEASE}") #-------------------------------------------------------------------- @@ -189,21 +193,21 @@ else(HIP) # -- Compiler flags ------------------------------------------------- - set(RELION_FLAGS_PROFILING "" CACHE STRING "") + set(RELION_FLAGS_PROFILING "-O2" CACHE STRING "") set(RELION_HIPCC_FLAGS_PROFILING "${RELION_HIPCC_FLAGS} -g -ggdb" CACHE STRING "Semicolon delimited flags") # -- Linker flags --------------------------------------------------- set(RELION_LINKER_FLAGS_PROFILING "") # -- Append compiler and linker flags ------------------------------- - set(CMAKE_CXX_FLAGS_PROFILING "${CMAKE_CXX_FLAGS_RELEASE} ${RELION_FLAGS_PROFILING} ${RELION_HIPCC_FLAGS_RELEASE}") - set(CMAKE_C_FLAGS_PROFILING "${CMAKE_C_FLAGS_RELEASE} ${RELION_FALAGS_PROFILING}") + set(CMAKE_CXX_FLAGS_PROFILING "${RELION_FLAGS_PROFILING} ${RELION_HIPCC_FLAGS_PROFILING}") + set(CMAKE_C_FLAGS_PROFILING "${RELION_FLAGS_PROFILING} ${RELION_HIPCC_FLAGS_PROFILING}") set(CMAKE_EXE_LINKER_FLAGS_PROFILING "${CMAKE_EXE_LINKER_FLAGS_RELEASE} ${RELION_LINKER_FLAGS_PROFILING}") # -- Add preprocessor defintions ------------------------------------ - set(RELION_DEFINITIONS_PROFILING "-DHIP_PROFILING") + set(RELION_DEFINITIONS_PROFILING "-DHIP_PROFILING -DDEBUG_HIP -DDEBUG") set(CMAKE_CXX_FLAGS_PROFILING "${CMAKE_CXX_FLAGS_PROFILING} ${RELION_DEFINITIONS_PROFILING}") - #message(STATUS "RELION_FLAGS_PROFILING : ${RELION_FLAGS_PROFILING}") + # message(STATUS "RELION_FLAGS_PROFILING : ${RELION_FLAGS_PROFILING}") # message(STATUS "CMAKE_CXX_FLAGS_PROFILING : ${CMAKE_CXX_FLAGS_PROFILING}") #-------------------------------------------------------------------- endif() @@ -242,3 +246,4 @@ else(HIP) set(CMAKE_CXX_FLAGS_BENCHMARKING "${CMAKE_CXX_FLAGS_BENCHMARKING} ${RELION_DEFINITIONS_BENCHMARKING}") endif() #-------------------------------------------------------------------- +# message(STATUS "CMAKE_CXX_FLAGS_BENCHMARKING : ${CMAKE_CXX_FLAGS_BENCHMARKING}") \ No newline at end of file diff --git a/src/acc/hip/hip_kernels/diff2.h b/src/acc/hip/hip_kernels/diff2.h index a67c766bf..52dfc8e3f 100644 --- a/src/acc/hip/hip_kernels/diff2.h +++ b/src/acc/hip/hip_kernels/diff2.h @@ -54,9 +54,6 @@ __global__ void hip_kernel_diff2_coarse( __shared__ XFLOAT s_real[block_sz]; __shared__ XFLOAT s_imag[block_sz]; __shared__ XFLOAT s_corr[block_sz]; - __shared__ XFLOAT s_x[block_sz/prefetch_fraction]; - __shared__ XFLOAT s_y[block_sz/prefetch_fraction]; - __shared__ XFLOAT s_z[block_sz/prefetch_fraction]; XFLOAT diff2s[eulers_per_block] = {0.f}; @@ -83,8 +80,6 @@ __global__ void hip_kernel_diff2_coarse( y = floorfracf( xy, projector.imgX); if (z > projector.maxR) z -= projector.imgZ; - - s_z[tid/prefetch_fraction] = z; } else { @@ -93,11 +88,8 @@ __global__ void hip_kernel_diff2_coarse( } if (y > projector.maxR) y -= projector.imgY; - - s_x[tid/prefetch_fraction] = x; - s_y[tid/prefetch_fraction] = y; -// #pragma unroll + #pragma unroll for (int i = tid%prefetch_fraction; i < eulers_per_block; i += prefetch_fraction) { if(DATA3D) // if DATA3D, then REF3D as well. @@ -154,12 +146,30 @@ __global__ void hip_kernel_diff2_coarse( { if((init_pixel + i) >= image_size) break; + int x,y,z,xy; + if(DATA3D) + { + z = floorfracf( init_pixel + i , projector.imgX*projector.imgY); //TODO optimize index extraction. + xy = ( init_pixel + i ) % (projector.imgX*projector.imgY); + x = xy % projector.imgX; + y = floorfracf( xy, projector.imgX); + if (z > projector.maxR) + z -= projector.imgZ; + } + else + { + x = ( init_pixel + i ) % projector.imgX; + y = floorfracf( init_pixel + i , projector.imgX); + } + if (y > projector.maxR) + y -= projector.imgY; + XFLOAT real, imag; if(DATA3D) - translatePixel(s_x[i], s_y[i], s_z[i], tx, ty, tz, s_real[i + init_pixel % block_sz], s_imag[i + init_pixel % block_sz], real, imag); + translatePixel(x, y, z, tx, ty, tz, s_real[i + init_pixel % block_sz], s_imag[i + init_pixel % block_sz], real, imag); else - translatePixel(s_x[i], s_y[i], tx, ty, s_real[i + init_pixel % block_sz], s_imag[i + init_pixel % block_sz], real, imag); + translatePixel(x, y, tx, ty, s_real[i + init_pixel % block_sz], s_imag[i + init_pixel % block_sz], real, imag); #pragma unroll @@ -204,46 +214,30 @@ __global__ void hip_kernel_diff2_fine( unsigned long bid = blockIdx.x; unsigned long tid = threadIdx.x; - unsigned warp_num = blockDim.x/warpSize; - unsigned warp_id = tid/warpSize; - unsigned lane_id = tid % warpSize; - // // Specialize BlockReduce for a 1D block of 128 threads on type XFLOAT // typedef hipcub::BlockReduce BlockReduce; // // Allocate shared memory for BlockReduce // __shared__ typename BlockReduce::TempStorage temp_storage; unsigned long pixel; - XFLOAT ref_real, ref_imag, shifted_real, shifted_imag, diff_real, diff_imag; + XFLOAT ref_real, ref_imag, + shifted_real, shifted_imag, + diff_real, diff_imag; - extern __shared__ XFLOAT s_sum[]; - __shared__ XFLOAT s_trans_x[chunk_sz]; - __shared__ XFLOAT s_trans_y[chunk_sz]; - __shared__ XFLOAT s_trans_z[chunk_sz]; - __shared__ XFLOAT s_eulers[9]; + __shared__ XFLOAT s[block_sz*chunk_sz]; //We MAY have to do up to chunk_sz translations in each block + __shared__ XFLOAT s_outs[chunk_sz]; // inside the padded 2D orientation gri // if( bid < todo_blocks ) // we only need to make { unsigned trans_num = (unsigned)d_job_num[bid]; //how many transes we have for this rot - + for (int itrans=0; itrans itrans) - { - if(DATA3D) - translatePixel(x, y, z, s_trans_x[itrans], s_trans_y[itrans], s_trans_z[itrans], g_imgs_real[pixel], g_imgs_imag[pixel], shifted_real, shifted_imag); - else - translatePixel(x, y, s_trans_x[itrans], s_trans_y[itrans], g_imgs_real[pixel], g_imgs_imag[pixel], shifted_real, shifted_imag); + iy = d_trans_idx[d_job_idx[bid]] + itrans; - diff_real = ref_real - shifted_real; - diff_imag = ref_imag - shifted_imag; - r_sum[itrans] += (diff_real * diff_real + diff_imag * diff_imag) * (XFLOAT)0.5 * g_corr_img[pixel]; - } + if(DATA3D) + translatePixel(x, y, z, trans_x[iy], trans_y[iy], trans_z[iy], g_imgs_real[pixel], g_imgs_imag[pixel], shifted_real, shifted_imag); + else + translatePixel(x, y, trans_x[iy], trans_y[iy], g_imgs_real[pixel], g_imgs_imag[pixel], shifted_real, shifted_imag); + + diff_real = ref_real - shifted_real; + diff_imag = ref_imag - shifted_imag; + s[itrans*block_sz + tid] += (diff_real * diff_real + diff_imag * diff_imag) * (XFLOAT)0.5 * __ldg(&g_corr_img[pixel]); } } + __syncthreads(); } - - for (int itrans = 0; itrans < trans_num; itrans++) + for(int j=(block_sz/2); j>0; j/=2) { - XFLOAT val = r_sum[itrans]; - for (int offset = warpSize/2; offset > 0; offset /= 2) - { - val += __shfl_down(val, offset); - } - if (lane_id == 0) + if(tid -__global__ void hip_kernel_multi( T *A, - T *OUT, - T S, - int image_size) +__global__ void hip_kernel_multi( T *A, + T *OUT, + T S, + int image_size) { int pixel = threadIdx.x + blockIdx.x*blockDim.x; if(pixel -__global__ void hip_kernel_multi( - T *A, - T S, - int image_size) +__global__ void hip_kernel_multi( T *A, + T S, + int image_size) { int pixel = threadIdx.x + blockIdx.x*blockDim.x; if(pixel -__global__ void hip_kernel_multi( T *A, - T *B, - T *OUT, - T S, - int image_size) +__global__ void hip_kernel_multi( T *A, + T *B, + T *OUT, + T S, + int image_size) { int pixel = threadIdx.x + blockIdx.x*blockDim.x; if(pixel(double2*, int, int, int); template __global__ void hip_kernel_centerFFTbySign(float2*, int, int, int); -#if !defined(__HIP_ARCH__) || __HIP_ARCH__ >= gfx906 +#if !defined(__HIP_ARCH__) || __HIP_ARCH__ != gfx906 #else __device__ double atomicAdd(double* address, double val) { diff --git a/src/acc/hip/hip_kernels/hip_device_utils.h b/src/acc/hip/hip_kernels/hip_device_utils.h index 7368e1a38..62092fa26 100644 --- a/src/acc/hip/hip_kernels/hip_device_utils.h +++ b/src/acc/hip/hip_kernels/hip_device_utils.h @@ -20,9 +20,9 @@ __device__ inline double hip_atomic_add(double* address, double val) #else __device__ inline void hip_atomic_add(float* address, float value) { - atomicAddNoRet(address,value); + // atomicAddNoRet(address,value); // unsafeAtomicAdd(address,value); - // atomicAdd(address,value); + atomicAdd(address,value); } #endif diff --git a/src/acc/hip/hip_kernels/wavg.h b/src/acc/hip/hip_kernels/wavg.h index 34236b4c3..2710e7fbf 100644 --- a/src/acc/hip/hip_kernels/wavg.h +++ b/src/acc/hip/hip_kernels/wavg.h @@ -31,7 +31,7 @@ __global__ void hip_kernel_wavg( XFLOAT significant_weight, XFLOAT part_scale) { - #if 1 + #if 0 float4 ref_real, ref_imag, img_real, img_imag, trans_real, trans_imag; int bid = blockIdx.x; //block ID diff --git a/src/ml_optimiser_mpi.cpp b/src/ml_optimiser_mpi.cpp index 037d42a33..17db31ecf 100644 --- a/src/ml_optimiser_mpi.cpp +++ b/src/ml_optimiser_mpi.cpp @@ -962,13 +962,25 @@ void MlOptimiserMpi::expectation() if (free < required_free) { printf("WARNING: Ignoring required free GPU memory amount of %zu MB, due to space insufficiency.\n", required_free/1000000); - allocationSize = (double)free *0.3; + #ifdef _CUDA_ENABLED + allocationSize = (double)free *0.7; + #elif defined _HIP_ENABLED + allocationSize = (double)free *0.7; + #endif } else { - // allocationSize = free - required_free; - allocationSize = (double)free *0.3; - printf("WARNING: due to 296623 ticket, use 30% of free GPU memory as allocationSize %zu MB.\n", allocationSize/1000000); + #ifdef _CUDA_ENABLED + allocationSize = free - required_free; + #elif defined _HIP_ENABLED + allocationSize = free - required_free; + // allocationSize = (double)free *0.5; + // #ifdef DEBUG_HIP + // printf("WARNING: due to 296623 ticket, use 50%% of free GPU memory as allocationSize %zu MB.\n", allocationSize/1000000); + // #else + // printf("WARNING: Using 50%% of free GPU memory as allocationSize %zu MB.\n", allocationSize/1000000); + // #endif + #endif } if (allocationSize < 200000000) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6fd5c7114..47d77090a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,7 +7,7 @@ target_link_libraries(tests relion_lib) target_link_libraries(tests ${FFTW_LIBRARIES}) target_link_libraries(tests ${TIFF_LIBRARIES}) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") include_directories(tests PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/tests) include_directories(tests PRIVATE ${FFTW_INCLUDES})