Added the below environment variables
- relionSyclUseCuda
- relionSyclUseHip
- relionSyclUseSubSubDevice
- relionSyclUseInOrderQueue
- relionSyclUseAsyncSubmission
- relionSyclUseStream
do-jason committed Oct 17, 2023
1 parent e6ddc77 commit 8ce734a
Showing 10 changed files with 765 additions and 194 deletions.
26 changes: 16 additions & 10 deletions README_sycl.md
@@ -24,7 +24,7 @@ This is SYCL/DPC++ version for [RELION](https://github.com/3dem/relion)
$ git clone https://github.com/3dem/relion-devel.git relion_sycl -b sycl-merge
$ cd relion_sycl; mkdir build_sycl; cd build_sycl
$ {Load Intel oneAPI toolkit and SYCL/Level Zero/OpenCL runtime environment}
$ sycl-ls # This will display available SYCL devices
$ cmake \
-DCMAKE_C_COMPILER=mpiicx \
-DCMAKE_CXX_COMPILER=mpiicpx \
@@ -40,9 +40,9 @@ $ cmake \
..

$ #### This is Intel GPU Level Zero backend specific #####
$ export ZE_AFFINITY_MASK=0 # Use only the first available Level Zero device. This can be replaced by --gpu 0 syntax.
- $ export ZEX_NUMBER_OF_CCS=0:4,1:4
- $ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 # Don't use this with Intel Arc GPUs. Only for Intel Data Center GPUs
+ $ export ZEX_NUMBER_OF_CCS=0:4,1:4 # Set this only if you are putting more than one MPI rank per GPU. 0:4 means 4 MPI ranks running on card 0
+ $ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
$ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
$ export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR="1;4G;host:16M,512,64K;device:16M,1024,64K;shared:0,0,64K"
$ #### End of Intel GPU Level Zero backend specific #####
@@ -52,14 +52,24 @@ $ ulimit -n 512000 # this is necessary for multi-GPU jobs
$ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/--sycl-opencl/--sycl-cpu/--sycl-cuda/--sycl-hip}
```

+ ## Optional runtime environment variables
+
+ + The shell environment variables below can be tested for additional SYCL-specific tuning. Setting one to "1" or "on" enables the corresponding feature (see the sketch after this list).
+ + `relionSyclUseCuda`: --sycl-cuda is used even if --gpu/--sycl is given on the command line
+ + `relionSyclUseHip`: --sycl-hip is used even if --gpu/--sycl is given on the command line
+ + `relionSyclUseInOrderQueue`: Use an in-order SYCL queue. Without this, an out-of-order SYCL queue is used by default. (experimental)
+ + `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
+ + `relionSyclUseStream`: Create a new in-order SYCL queue for each cudaStream. (experimental)
+ + `relionSyclUseSubSubDevice`: Create a separate SYCL queue for each CCS. (experimental)


## Added macros

+ For CMake configuration
+ `SYCL`(=ON/OFF): Enable the SYCL-based acceleration build
+ `SyclForceOneDPL`(=ON/OFF): Use [oneDPL](https://github.com/oneapi-src/oneDPL) where possible. This has the same effect as adding "-DUSE_ONEDPL" to CMAKE_CXX_FLAGS below. (experimental)
- + `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT(Ahead-Of-Time) compilation for SPIR64 target. Default target is pvc. (experimental)
- + `SYCL_AOT_TARGET`(=ON/OFF): Specify AOT(Ahead-Of-Time) SPIR64 target. Possible list can be checked using "ocloc compile --help" command. (experimental)
+ + `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT (Ahead-Of-Time) compilation for the SPIR64 target. The default target is pvc. (for future use)
+ + `SYCL_AOT_TARGET`: Specify the AOT (Ahead-Of-Time) SPIR64 target. The possible targets can be listed with the "ocloc compile --help" command. (for future use)
+ `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target (Not tested)
+ `SYCL_CUDA_TARGET`: SYCL CUDA arch target (Not tested)
+ `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target (Not tested)
@@ -78,10 +88,6 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-
+ `INTEL_SG_SIZE`: Intel sub-group size used in SYCL kernels. 32 is recommended for PVC and 16 for ATS. (Not tested well)
+ `USE_IPP`: Use the Intel IPP library's RadixSort for sortOnDevice instead of std::sort. Enabled by default if the IPP library exists.
+ `USE_MPI_COLLECTIVE`: Use MPI collective whenever possible. Enabled by default for ALTCPU and SYCL.
- + `USE_INORDER_QUEUE`: Use in-order SYCL queue. Without this, out-of-order SYCL queue is used by default. (experimental)
- + `USE_ASYNC_SYCL_SUBMIT`: Remove wait() for each SYCL kernel submission. (experimental)
- + `USE_SYCL_STREAM`: Create new in-order SYCL queue for each cudaStream. (experimental)
- + `USE_SUBSUB_DEVICE`: Create separate SYCL queue for each CCS. (experimental)
+ `USE_EXISTING_SYCL_DEVICE`: Copy and reuse the already-created SYCL device pointer instead of creating a new SYCL device for each thread. Not recommended.
+ `USE_SINCOS_TABLE`: Pre-calculate sine/cosine table before main loop in some kernels (Not implemented)
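
The four queue-related compile-time flags removed above correspond one-to-one to the `relionSyclUse*` runtime variables documented earlier, so the behavior can now be chosen without rebuilding. For orientation, a standalone SYCL 2020 sketch (independent of RELION's own wrapper types) of the distinction behind `relionSyclUseInOrderQueue`:

```cpp
// Standalone SYCL 2020 sketch, not RELION code: the two queue orderings
// that relionSyclUseInOrderQueue chooses between.
#include <sycl/sycl.hpp>

int main()
{
    sycl::device dev{sycl::default_selector_v};

    // Default construction: an out-of-order queue; independent submissions
    // may overlap unless ordered through events or buffer dependencies.
    sycl::queue outOfOrderQ{dev};

    // With the in_order property, submissions run strictly in submission order.
    sycl::queue inOrderQ{dev, sycl::property::queue::in_order{}};

    return 0;
}
```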

56 changes: 40 additions & 16 deletions src/acc/acc_ml_optimiser_impl.h
@@ -1270,6 +1270,11 @@ void getAllSquaredDifferencesCoarse(
for (int exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
+ #elif _SYCL_ENABLED
+     if (accMLO->useStream())
+         for (int exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
+     devAcc->waitAll();
#endif

for (unsigned long iclass = sp.iclass_min, allWeights_pos=0; iclass <= sp.iclass_max; iclass++)
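
The recurring change in the hunks that follow replaces the compile-time `#ifdef USE_SYCL_STREAM` guard with a runtime `accMLO->useStream()` check before draining the per-class queues. Purely as an illustration of that shape — the `DeviceQueue` class and the bodies of `useStream()` and `waitAll()` below are assumptions, not RELION's actual types:

```cpp
// Illustrative sketch only: an assumed stand-in for RELION's queue wrapper,
// with the stream toggle decided at run time instead of compile time.
#include <sycl/sycl.hpp>
#include <cstdlib>
#include <cstring>

class DeviceQueue  // hypothetical stand-in for the classStreams element type
{
public:
    explicit DeviceQueue(const sycl::device& dev)
        : q{dev, sycl::property::queue::in_order{}} {}

    // Block until every submission on this queue has finished,
    // rethrowing any asynchronous errors.
    void waitAll() { q.wait_and_throw(); }

private:
    sycl::queue q;
};

// Runtime stand-in for the old compile-time USE_SYCL_STREAM guard.
inline bool useStream()
{
    const char* v = std::getenv("relionSyclUseStream");
    return v && (std::strcmp(v, "1") == 0 || std::strcmp(v, "on") == 0);
}

int main()
{
    if (useStream())
    {
        DeviceQueue q{sycl::device{sycl::default_selector_v}};
        q.waitAll();  // nothing submitted yet; returns immediately
    }
    return 0;
}
```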
@@ -1334,10 +1339,9 @@ void getAllSquaredDifferencesCoarse(
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread)); // does not appear to be NEEDED FOR NON-BLOCKING CLASS STREAMS in tests, but should be to sync against classStreams
#elif _SYCL_ENABLED
- #ifdef USE_SYCL_STREAM
-     for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
-         (accMLO->classStreams[exp_iclass])->waitAll();
- #endif
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
#ifdef USE_ONEDPL
devAcc->waitAll();
#else
@@ -1694,6 +1698,11 @@ void getAllSquaredDifferencesFine(
for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
+ #elif _SYCL_ENABLED
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
+     devAcc->waitAll();
#endif

for (unsigned long iclass = sp.iclass_min; iclass <= sp.iclass_max; iclass++)
@@ -1779,10 +1788,9 @@ void getAllSquaredDifferencesFine(
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
#elif _SYCL_ENABLED
- #ifdef USE_SYCL_STREAM
-     for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
-         (accMLO->classStreams[exp_iclass])->waitAll();
- #endif
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
devAcc->waitAll();
#ifndef USE_ONEDPL
FinePassWeights.rot_id.freeDeviceIfSet();
@@ -2296,10 +2304,9 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
#elif defined(_SYCL_ENABLED) && defined(USE_ONEDPL)
- #ifdef USE_SYCL_STREAM
-     for (int exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
-         (accMLO->classStreams[exp_iclass])->waitAll();
- #endif
+     if (accMLO->useStream())
+         for (int exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
devAcc->waitAll();
#endif

@@ -2384,10 +2391,9 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
#elif defined(_SYCL_ENABLED) && defined(USE_ONEDPL)
- #ifdef USE_SYCL_STREAM
-     for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
-         (accMLO->classStreams[exp_iclass])->waitAll();
- #endif
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
devAcc->waitAll();
#endif

@@ -3066,6 +3072,11 @@ void storeWeightedSums(OptimisationParamters &op, SamplingParameters &sp,
for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[exp_iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
+ #elif _SYCL_ENABLED
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
+     devAcc->waitAll();
#endif

for (unsigned long iclass = sp.iclass_min; iclass <= sp.iclass_max; iclass++)
@@ -3168,6 +3179,11 @@ void storeWeightedSums(OptimisationParamters &op, SamplingParameters &sp,
for (unsigned long iclass = sp.iclass_min; iclass <= sp.iclass_max; iclass++)
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(accMLO->classStreams[iclass]));
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
+ #elif _SYCL_ENABLED
+     if (accMLO->useStream())
+         for (unsigned long iclass = sp.iclass_min; iclass <= sp.iclass_max; iclass++)
+             (accMLO->classStreams[iclass])->waitAll();
+     devAcc->waitAll();
#endif

classPos = 0;
@@ -3350,6 +3366,14 @@ void storeWeightedSums(OptimisationParamters &op, SamplingParameters &sp,

wdiff2s.cpToHost();
DEBUG_HANDLE_ERROR(cudaStreamSynchronize(cudaStreamPerThread));
+ #elif _SYCL_ENABLED
+     if (accMLO->useStream())
+         for (unsigned long exp_iclass = sp.iclass_min; exp_iclass <= sp.iclass_max; exp_iclass++)
+             (accMLO->classStreams[exp_iclass])->waitAll();
+     devAcc->waitAll();
+
+     wdiff2s.cpToHost();
+     wdiff2s.streamSync();
#endif

AAXA_pos=0;
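
Judging by the CUDA branch above, the added `wdiff2s.cpToHost()` / `wdiff2s.streamSync()` pair is an asynchronous device-to-host copy followed by a blocking wait. A standalone SYCL 2020 sketch of that idiom (not RELION's container classes):

```cpp
// Standalone SYCL 2020 sketch of an async copy-back plus sync,
// not RELION's cpToHost()/streamSync() implementation.
#include <sycl/sycl.hpp>
#include <vector>

int main()
{
    // In-order queue so the fill below completes before the copy starts.
    sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}};
    constexpr size_t n = 1024;

    double* devBuf = sycl::malloc_device<double>(n, q);
    std::vector<double> hostBuf(n);

    q.fill(devBuf, 1.0, n);                                // device-side work
    q.memcpy(hostBuf.data(), devBuf, n * sizeof(double));  // async copy to host
    q.wait();                                              // streamSync() analogue

    sycl::free(devBuf, q);
    return 0;
}
```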
