From 29b3abc7efe88ddeff7df9ae2f630414fe07f8c6 Mon Sep 17 00:00:00 2001 From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> Date: Wed, 2 Oct 2024 21:12:14 +0800 Subject: [PATCH] Fix: fix cusolvermp compiling error with icpc and update ks_solver doc (#5196) * fix compilation error of icpc * add cusolvermp * update ks_solver related doc --- docs/advanced/acceleration/cuda.md | 2 +- docs/advanced/input_files/input-main.md | 10 +++++++++- source/module_hsolver/kernels/cuda/diag_cusolvermp.cu | 7 +++++-- source/module_io/read_input_item_elec_stru.cpp | 1 + 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md index 98a85f11b0..3231961766 100644 --- a/docs/advanced/acceleration/cuda.md +++ b/docs/advanced/acceleration/cuda.md @@ -36,7 +36,7 @@ The ABACUS program will automatically determine whether the current ELPA support ## Run with the GPU support by editing the INPUT script: In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs. -- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver` and `elpa` is supported on GPU. +- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` are supported on GPU. - **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. 
If you only have one card, this command will only start one GPU. ## Examples diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 352339f739..b332f1fc80 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -933,6 +933,8 @@ calculations. - **genelpa**: This method should be used if you choose localized orbitals. - **scalapack_gvx**: Scalapack can also be used for localized orbitals. - **cusolver**: This method needs building with CUDA and at least one gpu is available. + - **cusolvermp**: This method supports multi-GPU acceleration and needs building with CUDA. Note that when using cusolvermp, you should set the number of MPI processes to be equal to the number of GPUs. + - **elpa**: The ELPA solver supports both CPU and GPU. By setting the `device` to GPU, you can launch the ELPA solver with GPU acceleration (provided that you have installed a GPU-supported version of ELPA, which requires you to manually compile and install ELPA, and the ABACUS should be compiled with -DUSE_ELPA=ON and -DUSE_CUDA=ON). The ELPA solver also supports multi-GPU acceleration. If you set ks_solver=`genelpa` for basis_type=`pw`, the program will be stopped with an error message: @@ -941,7 +943,13 @@ calculations. ``` Then the user has to correct the input file and restart the calculation. -- **Default**: cg (plane-wave basis), or genelpa (localized atomic orbital basis, if compiling option `USE_ELPA` has been set),lapack (localized atomic orbital basis, if compiling option `ENABLE_MPI` has not been set), scalapack_gvx, (localized atomic orbital basis, if compiling option `USE_ELPA` has not been set and if compiling option `ENABLE_MPI` has been set) +- **Default**: + - **PW basis**: cg. 
+ - **LCAO basis**: + - genelpa (if compiling option `USE_ELPA` has been set) + - lapack (if compiling option `ENABLE_MPI` has not been set) + - scalapack_gvx (if compiling option `USE_ELPA` has not been set and compiling option `ENABLE_MPI` has been set) + - cusolver (if compiling option `USE_CUDA` has been set) ### nbands diff --git a/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu b/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu index a0f7442185..25975e6303 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu @@ -10,13 +10,15 @@ extern "C" #include "module_hsolver/genelpa/Cblacs.h" } #include +#include #include "helper_cusolver.h" #include "module_base/global_function.h" #include "module_base/module_device/device.h" static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) { MPI_Request req; - int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + intptr_t ptr = reinterpret_cast(data); + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)ptr, &req); if (err != MPI_SUCCESS) { return CAL_ERROR; @@ -27,7 +29,8 @@ static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* da static calError_t request_test(void* request) { - MPI_Request req = (MPI_Request)(request); + intptr_t ptr = reinterpret_cast(request); + MPI_Request req = (MPI_Request)ptr; int completed; int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); if (err != MPI_SUCCESS) diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp index 4e7795a853..f7b861157e 100644 --- a/source/module_io/read_input_item_elec_stru.cpp +++ b/source/module_io/read_input_item_elec_stru.cpp @@ -69,6 +69,7 @@ void ReadInput::item_elec_stru() "lapack", "scalapack_gvx", "cusolver", + "cusolvermp", "pexsi", "cg_in_lcao", };