Implemented memory pool for SYCL device/host memory (All except ZE_* …

…and ZEX_* environment variables are not necessary. "ulimit -n" is not necessary.) Tuned SYCL kernel workgroup size (Showed big improvement in 2D classification performance) Fixed AccPtr host memory leak in PassWeights.weights (src/acc/acc_ml_optimiser_impl.h) Prevented threads of the same MPI rank from being assigned to different SYCL devices in automatic mapping Fix Intel IPP library header file for oneAPI 2024 release and older Minor update for SYCL2020 standard Consistent use of tab/space in source codes
3dem · Nov 30, 2023 · c2edd63 · c2edd63
1 parent 3ebda4a
commit c2edd63
Show file tree

Hide file tree

Showing 26 changed files with 923 additions and 198 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -422,7 +422,7 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" OR "${CMAKE_CXX_COMPILER}" MATCHES
 	endif(MKLFFT)
 
 	if(INTEL_IPP OR DEFINED ENV{IPPROOT})
-		if(EXISTS "$ENV{IPPROOT}/include/ipps.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
+		if(EXISTS "$ENV{IPPROOT}/include/ipp.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
 			if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
 				SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qipp")
 				SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -qipp -qipp-link=static")

diff --git a/README_sycl.md b/README_sycl.md
@@ -1,6 +1,6 @@
 # RELION SYCL/DPC++ version
 
-This is SYCL/DPC++ version for [RELION](https://github.com/3dem/relion)
+This is SYCL/DPC++ version of [RELION](https://github.com/3dem/relion)
 
 ## Build & Running
 
@@ -40,16 +40,12 @@ $ cmake \
 $ #### This is Intel GPU Level Zero backend specific #####
 $ export ZE_AFFINITY_MASK=0         # Use only the first available Level Zero device. This can be replaced by --gpu 0 syntax.
 $ export ZEX_NUMBER_OF_CCS=0:4,1:4  # Set this only if you are putting more than one MPI ranks per GPU. 0:4 means 4 MPI ranks running on card 0
-$ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-$ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-$ export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR="1;4G;host:16M,512,64K;device:16M,1024,64K;shared:0,0,64K"
 $ #### End of Intel GPU Level Zero backend specific  #####
 $ # For finer control of SYCL devcices, please see the above descrpition on ONEAPI_DEVICE_SELECTOR
-$ 
-$ ulimit -n 512000 # this is necessary for multi-GPU jobs
 $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/--sycl-opencl/--sycl-cpu/--sycl-cuda/--sycl-hip}
 ```
 
+
 ## Optional runtime environment variables
 
 + The below shell environment variables can be tested for more potential SYCL specific tuning. Setting it to "1" or "on" will enable these features.
@@ -59,6 +55,12 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-
 	+ `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
 	+ `relionSyclUseStream`: Create new in-order SYCL queue for each cudaStream. (experimental)
 	+ `relionSyclUseSubSubDevice`: Create separate SYCL queue for each CCS. (experimental)
+	+ `relionSyclBlockSize`: SYCL memory pool block size. This takes precedence over relionSyclHostBlockSize and relionSyclDeviceBlockSize.
+	+ `relionSyclHostBlockSize`: SYCL memory pool block size for sycl::malloc_host. Default is 256MB
+	+ `relionSyclDeviceBlockSize`: SYCL memory pool block size for sycl::malloc_device. Default is 256MB
+	+ `MAX_MPI_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point and collective communication. This takes precedence over MAX_MPI_P2P_BLOCK and MAX_MPI_COLL_BLOCK.
+	+ `MAX_MPI_P2P_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point communication. Default is 4GB.
+	+ `MAX_MPI_COLL_BLOCK`: Maximum MPI message size per single MPI API call for collective communication. Default is 64MB.
 
 
 ## Added macros

diff --git a/src/acc/acc_ml_optimiser_impl.h b/src/acc/acc_ml_optimiser_impl.h
@@ -2015,8 +2015,8 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
             PassWeights.weights.setAccPtr(&(~Mweight)[0]);
             PassWeights.weights.setHostPtr(&Mweight[0]);
             PassWeights.weights.setSize(nr_coarse_weights);
+            PassWeights.weights.doFreeHost=false;
         }
-        PassWeights.weights.doFreeHost=false;
 
         std::pair<size_t, XFLOAT> min_pair=AccUtilities::getArgMinOnDevice<XFLOAT>(PassWeights.weights);
         PassWeights.weights.cpToHost();

diff --git a/src/acc/acc_ptr.h b/src/acc/acc_ptr.h
@@ -557,13 +557,11 @@ class AccPtr
 				ACC_PTR_DEBUG_FATAL(str.c_str());
 				CRITICAL(RAMERR);
 			}
-			isHostSYCL = true;
 		}
 		else
 		{
 			if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
 				CRITICAL(RAMERR);
-			isHostSYCL = false;
 		}
 #else
 		if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
@@ -580,6 +578,12 @@ class AccPtr
 		freeHostIfSet();	
 	    setSize(newSize);
 	    setHostPtr(newArr);
+#ifdef _SYCL_ENABLED
+		if(accType == accSYCL)
+			isHostSYCL = true;
+		else
+			isHostSYCL = false;
+#endif
 	    doFreeHost=true;
 	}
 
@@ -602,13 +606,11 @@ class AccPtr
 				ACC_PTR_DEBUG_FATAL(str.c_str());
 				CRITICAL(RAMERR);
 			}
-			isHostSYCL = true;
 		}
 		else
 		{
 			if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
 				CRITICAL(RAMERR);
-			isHostSYCL = false;
 		}
 #else
 		if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
@@ -646,6 +648,12 @@ class AccPtr
 		freeHostIfSet();
 	    setSize(newSize);
 	    setHostPtr(newArr);
+#ifdef _SYCL_ENABLED
+		if(accType == accSYCL)
+			isHostSYCL = true;
+		else
+			isHostSYCL = false;
+#endif
 	    doFreeHost=true;
 	}