Skip to content

Commit

Permalink
HIP and Spack (#468)
Browse files Browse the repository at this point in the history
* HIP compatibility, remove need for hipify-perl

* split GPU N-body code into additional units

handle gpu-direct with interface compile_definition
removed duplicate cartesian multipole implementation
  • Loading branch information
sekelle committed Dec 9, 2024
1 parent f56fc7c commit 4be3f10
Show file tree
Hide file tree
Showing 78 changed files with 1,149 additions and 1,913 deletions.
10 changes: 5 additions & 5 deletions .gitlab/Dockerfile_hip_2
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ RUN echo \
&& unset GCC_X86_64 \
&& cd /usr/local/games/SPH-EXA.git \
&& ls -la . \
&& parallel -j+0 hipify-perl -inplace ::: \
`find . -name '*.h' -o -name '*.cuh' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cu'` \
&& sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./domain/cmake/setup_GTest.cmake \
&& sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./cmake/setup_GTest.cmake \
&& sed -i "s@GIT_REPOSITORY@SOURCE_DIR $GGTEST_VERSION/\n#@" ./ryoanji/cmake/setup_GTest.cmake \
Expand All @@ -106,9 +104,11 @@ RUN echo \
MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \
cmake -S SPH-EXA.git -B build \
-DCMAKE_CXX_COMPILER=mpicxx -DCMAKE_C_COMPILER=mpicc \
-DBUILD_TESTING=OFF -DBUILD_ANALYTICAL=OFF -DGPU_DIRECT=OFF \
-DCMAKE_BUILD_TYPE=Debug -DCMAKE_HIP_ARCHITECTURES=gfx90a \
&& echo "## cmake --build + --install :" \
-DCMAKE_HIP_COMPILER=mpicxx -DCMAKE_HIP_COMPILER_FORCED=ON \
-DCMAKE_HIP_ARCHITECTURES=gfx90a \
-DCSTONE_WITH_GPU_AWARE_MPI=ON \
-DBUILD_TESTING=OFF \
&& echo "## cmake --build :" \
&& MPICH_CXX=/opt/rocm-5.2.3/llvm/bin/clang++ \
MPICH_CC=/opt/rocm-5.2.3/llvm/bin/clang \
cmake --build build -j `grep processor /proc/cpuinfo | wc -l` -t sphexa-hip
Expand Down
47 changes: 27 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ list(APPEND CMAKE_MODULE_PATH
option(BUILD_TESTING "build unit and integration tests" ON)
option(BUILD_ANALYTICAL "build analytical solution" ON)

option(GPU_DIRECT "Enable CUDA-aware MPI communication" OFF)
option(SPH_EXA_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
option(SPH_EXA_WITH_HIP "Enable building for AMD GPUs" ON)

set(CSTONE_DIR ${PROJECT_SOURCE_DIR}/domain/include)
set(CSTONE_TEST_DIR ${PROJECT_SOURCE_DIR}/domain/test)
Expand Down Expand Up @@ -46,18 +47,32 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
if(SPH_EXA_WITH_CUDA)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
else()
message(STATUS "No CUDA support")
set(SPH_EXA_WITH_CUDA OFF)
endif()
endif()

if(SPH_EXA_WITH_HIP)
check_language(HIP)
if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
else()
message(STATUS "No HIP support")
set(SPH_EXA_WITH_HIP OFF)
endif()
endif()

check_language(HIP)
if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
if(SPH_EXA_WITH_HIP AND SPH_EXA_WITH_CUDA)
message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
endif()

option(SPH_EXA_WITH_H5PART "Enable HDF5 IO using the H5Part library" ON)
Expand All @@ -81,22 +96,14 @@ elseif(INSITU STREQUAL "Ascent")
find_package(Ascent REQUIRED PATHS "$ENV{EBROOTASCENT}/lib/cmake/ascent")
endif()

option(SPH_EXA_WITH_FFTW "Enable use of the FFTW library" ON)
if (SPH_EXA_WITH_FFTW)
find_package(FFTW)
if (NOT FFTW_FOUND)
message(STATUS "No FFTW support")
set(SPH_EXA_WITH_FFTW OFF)
endif ()
endif ()

option(SPH_EXA_WITH_GRACKLE "Enable radiative cooling with GRACKLE" OFF)
if (SPH_EXA_WITH_GRACKLE)
add_subdirectory(extern/grackle)
endif()

add_subdirectory(domain)
add_subdirectory(ryoanji)
add_subdirectory(scripts)
add_subdirectory(sph)
add_subdirectory(physics)
add_subdirectory(main)
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,12 @@ cd build
CC=cc CXX=CC cmake -DCMAKE_CUDA_ARCHITECTURES=60 -S <GIT_SOURCE_DIR>

```
Module and CMake configuration on LUMI
Module and CMake configuration on LUMI (ROCm 6.2.2)
```shell
module load CrayEnv buildtools/22.12 craype-accel-amd-gfx90a rocm cray-hdf5-parallel
cd <GIT_SOURCE_DIR>; hipify-perl -inplace `find -name *.cu -o -name *.cuh` && find -name *.prehip -delete
cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCMAKE_HIP_COMPILER_FORCED=ON -DGPU_DIRECT=<ON/OFF> -S <GIT_SOURCE_DIR>
module swap PrgEnv-cray PrgEnv-gnu
module load CrayEnv buildtools craype-accel-amd-gfx90a rocm cray-hdf5-parallel
cd <GIT_SOURCE_DIR>;
cmake -DCMAKE_CXX_COMPILER=CC -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_HIP_COMPILER=CC -DCSTONE_WITH_GPU_AWARE_MPI=ON -S <GIT_SOURCE_DIR>
```

Build everything: ```make -j```
Expand Down
1 change: 0 additions & 1 deletion README_insitu.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ module load ParaView/5.10.1-CrayGNU-21.09-EGL
-DBUILD_ANALYTICAL:BOOL=OFF \
-DBUILD_TESTING:BOOL=OFF \
-DSPH_EXA_WITH_H5PART:BOOL=OFF \
-DSPH_EXA_WITH_FFTW:BOOL=OFF \
-DCMAKE_CXX_COMPILER=CC \
-DINSITU=Ascent
Expand Down
43 changes: 33 additions & 10 deletions domain/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,41 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
option(CSTONE_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
option(CSTONE_WITH_HIP "Enable building for AMD GPUs" ON)
option(CSTONE_WITH_GPU_AWARE_MPI "Enable CUDA-aware MPI communication" OFF)

if(GPU_DIRECT)
message(WARNING "Option GPU_DIRECT is deprecated and will be removed. Use -DCSTONE_WITH_GPU_AWARE_MPI=ON instead.")
set(CSTONE_WITH_GPU_AWARE_MPI ON)
endif()

if(CSTONE_WITH_CUDA)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
else()
message(STATUS "No CUDA support")
set(CSTONE_WITH_CUDA OFF)
endif()
endif()

if(CSTONE_WITH_HIP)
check_language(HIP)
if(CMAKE_HIP_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
else()
message(STATUS "No HIP support")
set(CSTONE_WITH_HIP OFF)
endif()
endif()

check_language(HIP)
if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
if(CSTONE_WITH_HIP AND CSTONE_WITH_CUDA)
message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
endif()

add_subdirectory(include)
Expand Down
2 changes: 1 addition & 1 deletion domain/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ CUDA version: 11.6 or later, HIP version 5.2 or later.

Example CMake invocation:
```shell
CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60,80,90 -DGPU_DIRECT=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60;80;90 -DCSTONE_WITH_GPU_AWARE_MPI=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
```

GPU-direct (RDMA) MPI communication can be turned on or off by supplying `-D GPU_DIRECT=ON`. Default is `OFF`.
Expand Down
4 changes: 4 additions & 0 deletions domain/include/cstone/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
$<TARGET_OBJECTS:traversal_obj>
$<TARGET_OBJECTS:source_center_gpu_obj>
$<TARGET_OBJECTS:gpu_utils_obj>)

if (CSTONE_WITH_GPU_AWARE_MPI)
target_compile_definitions(cstone_gpu INTERFACE CSTONE_HAVE_GPU_AWARE_MPI)
endif()
endif ()
14 changes: 14 additions & 0 deletions domain/include/cstone/cuda/cub.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

#pragma once

#ifdef __HIPCC__

#include <hipcub/hipcub.hpp>

namespace cub = hipcub;

#else

#include <cub/cub.cuh>

#endif
61 changes: 61 additions & 0 deletions domain/include/cstone/cuda/cuda_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*! @file
* @brief CUDA/HIP runtime API compatiblity wrapper
*
* @author Sebastian Keller <[email protected]>
*/

#pragma once

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)

#include <hip/hip_runtime.h>

#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaErrorInvalidValue hipErrorInvalidValue
#define cudaError_t hipError_t
#define cudaEventCreate hipEventCreate
#define cudaEventDestroy hipEventDestroy
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost hipMallocHost
#define cudaMallocManaged hipMallocManaged
#define cudaMemAttachGlobal hipMemAttachGlobal
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyToSymbol hipMemcpyToSymbol
#define cudaMemoryTypeDevice hipMemoryTypeDevice
#define cudaMemoryTypeManaged hipMemoryTypeManaged
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess

#define GPU_SYMBOL HIP_SYMBOL

#else

#include <cuda_runtime.h>

#define GPU_SYMBOL(x) x

#endif
3 changes: 2 additions & 1 deletion domain/include/cstone/cuda/cuda_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#pragma once

#include <type_traits>
#include <cuda_runtime.h>
#include <vector>
#include "cuda_runtime.hpp"

#include "device_vector.h"
#include "cuda_stubs.h"
Expand Down
5 changes: 4 additions & 1 deletion domain/include/cstone/cuda/device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <thrust/device_vector.h>
#include <thrust/fill.h>

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/cuda/errorcheck.cuh"
#include "cstone/util/noinit_thrust.cuh"

#include "device_vector.h"
Expand Down Expand Up @@ -83,7 +85,7 @@ DeviceVector<T>::DeviceVector(const T* first, const T* last)
{
auto size = last - first;
impl_->resize(size);
cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice);
checkGpuErrors(cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice));
}

template<class T>
Expand Down Expand Up @@ -174,6 +176,7 @@ template class DeviceVector<util::array<int, 2>>;
template class DeviceVector<util::array<int, 3>>;
template class DeviceVector<util::array<unsigned, 1>>;
template class DeviceVector<util::array<uint64_t, 1>>;
template class DeviceVector<util::array<uint64_t, 2>>;
template class DeviceVector<util::array<unsigned, 2>>;
template class DeviceVector<util::array<float, 3>>;
template class DeviceVector<util::array<double, 3>>;
Expand Down
2 changes: 1 addition & 1 deletion domain/include/cstone/cuda/errorcheck.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#pragma once

#include <cstdio>
#include <cuda_runtime.h>
#include "cuda_runtime.hpp"

inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
{
Expand Down
3 changes: 1 addition & 2 deletions domain/include/cstone/cuda/gpu_config.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
#pragma once

#include <cstdint>
#include <cuda_runtime.h>

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/cuda/errorcheck.cuh"

namespace cstone
Expand Down
9 changes: 4 additions & 5 deletions domain/include/cstone/focus/rebalance_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
* @author Sebastian Keller <[email protected]>
*/

#include <cub/cub.cuh>

#include "cstone/cuda/cub.hpp"
#include "cstone/cuda/errorcheck.cuh"
#include "cstone/focus/rebalance.hpp"
#include "cstone/focus/rebalance_gpu.h"
Expand Down Expand Up @@ -159,7 +158,7 @@ bool protectAncestorsGpu(const KeyType* prefixes,
protectAncestorsKernel<<<iceil(numNodes, numThreads), numThreads>>>(prefixes, parents, nodeOps, numNodes);

int numNodesModify;
checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, nodeOpSum, sizeof(int)));
checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, GPU_SYMBOL(nodeOpSum), sizeof(int)));

return numNodesModify == 0;
}
Expand Down Expand Up @@ -197,7 +196,7 @@ ResolutionStatus enforceKeysGpu(const KeyType* forcedKeys,
}

int status;
checkGpuErrors(cudaMemcpyFromSymbol(&status, enforceKeyStatus_device, sizeof(ResolutionStatus)));
checkGpuErrors(cudaMemcpyFromSymbol(&status, GPU_SYMBOL(enforceKeyStatus_device), sizeof(ResolutionStatus)));
return static_cast<ResolutionStatus>(status);
}

Expand All @@ -215,4 +214,4 @@ template ResolutionStatus enforceKeysGpu(const uint64_t* forcedKeys,
const TreeNodeIndex* parents,
TreeNodeIndex* nodeOps);

} // namespace cstone
} // namespace cstone
1 change: 1 addition & 0 deletions domain/include/cstone/focus/source_center_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#pragma once

#include "cstone/focus/source_center.hpp"
#include "cstone/tree/definitions.h"

namespace cstone
Expand Down
1 change: 1 addition & 0 deletions domain/include/cstone/halos/gather_halos_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* @author Sebastian Keller <[email protected]>
*/

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/primitives/math.hpp"
#include "cstone/primitives/stl.hpp"
#include "cstone/util/array.hpp"
Expand Down
3 changes: 1 addition & 2 deletions domain/include/cstone/primitives/mpi_cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,12 @@
#pragma once

#include <vector>
#include <cuda_runtime.h>

#include "cstone/primitives/mpi_wrappers.hpp"
#include "cstone/util/noinit_alloc.hpp"
#include "cstone/cuda/errorcheck.cuh"

#ifdef USE_GPU_DIRECT
#ifdef CSTONE_HAVE_GPU_AWARE_MPI
constexpr inline bool useGpuDirect = true;
#else
constexpr inline bool useGpuDirect = false;
Expand Down
Loading

0 comments on commit 4be3f10

Please sign in to comment.