Skip to content

Commit

Permalink
Implemented memory pool for SYCL device/host memory (All except ZE_* …
Browse files Browse the repository at this point in the history
…and ZEX_* environment variables are not necessary. "ulimit -n" is not necessary.)

Tuned SYCL kernel workgroup size (Showed big improvement in 2D classification performance)
Fixed AccPtr host memory leak in PassWeights.weights (src/acc/acc_ml_optimiser_impl.h)
Prevented threads of the same MPI rank from being assigned to different SYCL devices in automatic mapping
Fix Intel IPP library header file for oneAPI 2024 release and older
Minor update for SYCL2020 standard
Consistent use of tab/space in source codes
  • Loading branch information
do-jason committed Nov 30, 2023
1 parent 3ebda4a commit c2edd63
Show file tree
Hide file tree
Showing 26 changed files with 923 additions and 198 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" OR "${CMAKE_CXX_COMPILER}" MATCHES
endif(MKLFFT)

if(INTEL_IPP OR DEFINED ENV{IPPROOT})
if(EXISTS "$ENV{IPPROOT}/include/ipps.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
if(EXISTS "$ENV{IPPROOT}/include/ipp.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qipp")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -qipp -qipp-link=static")
Expand Down
14 changes: 8 additions & 6 deletions README_sycl.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# RELION SYCL/DPC++ version

This is SYCL/DPC++ version for [RELION](https://github.com/3dem/relion)
This is SYCL/DPC++ version of [RELION](https://github.com/3dem/relion)

## Build & Running

Expand Down Expand Up @@ -40,16 +40,12 @@ $ cmake \
$ #### This is Intel GPU Level Zero backend specific #####
$ export ZE_AFFINITY_MASK=0 # Use only the first available Level Zero device. This can be replaced by --gpu 0 syntax.
$ export ZEX_NUMBER_OF_CCS=0:4,1:4 # Set this only if you are putting more than one MPI ranks per GPU. 0:4 means 4 MPI ranks running on card 0
$ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
$ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
$ export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR="1;4G;host:16M,512,64K;device:16M,1024,64K;shared:0,0,64K"
$ #### End of Intel GPU Level Zero backend specific #####
$ # For finer control of SYCL devcices, please see the above descrpition on ONEAPI_DEVICE_SELECTOR
$
$ ulimit -n 512000 # this is necessary for multi-GPU jobs
$ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/--sycl-opencl/--sycl-cpu/--sycl-cuda/--sycl-hip}
```


## Optional runtime environment variables

+ The below shell environment variables can be tested for more potential SYCL specific tuning. Setting it to "1" or "on" will enable these features.
Expand All @@ -59,6 +55,12 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-
+ `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
+ `relionSyclUseStream`: Create new in-order SYCL queue for each cudaStream. (experimental)
+ `relionSyclUseSubSubDevice`: Create separate SYCL queue for each CCS. (experimental)
+ `relionSyclBlockSize`: SYCL memory pool block size. This takes precedence over relionSyclHostBlockSize and relionSyclDeviceBlockSize.
+ `relionSyclHostBlockSize`: SYCL memory pool block size for sycl::malloc_host. Default is 256MB
+ `relionSyclDeviceBlockSize`: SYCL memory pool block size for sycl::malloc_device. Default is 256MB
+ `MAX_MPI_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point and collective communication. This takes precedence over MAX_MPI_P2P_BLOCK and MAX_MPI_COLL_BLOCK.
+ `MAX_MPI_P2P_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point communication. Default is 4GB.
+ `MAX_MPI_COLL_BLOCK`: Maximum MPI message size per single MPI API call for collective communication. Default is 64MB.


## Added macros
Expand Down
2 changes: 1 addition & 1 deletion src/acc/acc_ml_optimiser_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2015,8 +2015,8 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
PassWeights.weights.setAccPtr(&(~Mweight)[0]);
PassWeights.weights.setHostPtr(&Mweight[0]);
PassWeights.weights.setSize(nr_coarse_weights);
PassWeights.weights.doFreeHost=false;
}
PassWeights.weights.doFreeHost=false;

std::pair<size_t, XFLOAT> min_pair=AccUtilities::getArgMinOnDevice<XFLOAT>(PassWeights.weights);
PassWeights.weights.cpToHost();
Expand Down
16 changes: 12 additions & 4 deletions src/acc/acc_ptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,13 +557,11 @@ class AccPtr
ACC_PTR_DEBUG_FATAL(str.c_str());
CRITICAL(RAMERR);
}
isHostSYCL = true;
}
else
{
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
CRITICAL(RAMERR);
isHostSYCL = false;
}
#else
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
Expand All @@ -580,6 +578,12 @@ class AccPtr
freeHostIfSet();
setSize(newSize);
setHostPtr(newArr);
#ifdef _SYCL_ENABLED
if(accType == accSYCL)
isHostSYCL = true;
else
isHostSYCL = false;
#endif
doFreeHost=true;
}

Expand All @@ -602,13 +606,11 @@ class AccPtr
ACC_PTR_DEBUG_FATAL(str.c_str());
CRITICAL(RAMERR);
}
isHostSYCL = true;
}
else
{
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
CRITICAL(RAMERR);
isHostSYCL = false;
}
#else
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
Expand Down Expand Up @@ -646,6 +648,12 @@ class AccPtr
freeHostIfSet();
setSize(newSize);
setHostPtr(newArr);
#ifdef _SYCL_ENABLED
if(accType == accSYCL)
isHostSYCL = true;
else
isHostSYCL = false;
#endif
doFreeHost=true;
}

Expand Down
Loading

0 comments on commit c2edd63

Please sign in to comment.