From 29b3abc7efe88ddeff7df9ae2f630414fe07f8c6 Mon Sep 17 00:00:00 2001
From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com>
Date: Wed, 2 Oct 2024 21:12:14 +0800
Subject: [PATCH] Fix: fix cusolvermp compiling error with icpc and update
 ks_solver doc (#5196)

* fix compilation error of icpc

* add cusolvermp

* update ks_solver related doc
---
 docs/advanced/acceleration/cuda.md                    |  2 +-
 docs/advanced/input_files/input-main.md               | 10 +++++++++-
 source/module_hsolver/kernels/cuda/diag_cusolvermp.cu |  7 +++++--
 source/module_io/read_input_item_elec_stru.cpp        |  1 +
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md
index 98a85f11b0..3231961766 100644
--- a/docs/advanced/acceleration/cuda.md
+++ b/docs/advanced/acceleration/cuda.md
@@ -36,7 +36,7 @@ The ABACUS program will automatically determine whether the current ELPA support
 ## Run with the GPU support by editing the INPUT script:
 
 In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs.
-- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver` and `elpa` is supported on GPU.
+- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` are supported on GPU.
 - **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU. 
 
 ## Examples
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 352339f739..b332f1fc80 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -933,6 +933,8 @@ calculations.
   - **genelpa**: This method should be used if you choose localized orbitals.
   - **scalapack_gvx**: Scalapack can also be used for localized orbitals.
   - **cusolver**: This method needs building with CUDA and at least one gpu is available.
+  - **cusolvermp**: This method supports multi-GPU acceleration and requires building with CUDA. Note that when using cusolvermp, you should set the number of MPI processes to be equal to the number of GPUs.
+  - **elpa**: The ELPA solver supports both CPU and GPU. By setting `device` to `gpu`, you can launch the ELPA solver with GPU acceleration (provided that you have installed a GPU-supported version of ELPA, which requires you to manually compile and install ELPA, and that ABACUS has been compiled with `-DUSE_ELPA=ON` and `-DUSE_CUDA=ON`). The ELPA solver also supports multi-GPU acceleration.
 
   If you set ks_solver=`genelpa` for basis_type=`pw`, the program will be stopped with an error message:
 
@@ -941,7 +943,13 @@ calculations.
   ```
 
   Then the user has to correct the input file and restart the calculation.
-- **Default**: cg (plane-wave basis), or genelpa (localized atomic orbital basis, if compiling option `USE_ELPA` has been set),lapack (localized atomic orbital basis, if compiling option `ENABLE_MPI` has not been set), scalapack_gvx, (localized atomic orbital basis, if compiling option `USE_ELPA` has not been set and if compiling option `ENABLE_MPI` has been set)
+- **Default**: 
+  - **PW basis**: cg.
+  - **LCAO basis**:
+    - genelpa (if compiling option `USE_ELPA` has been set)
+    - lapack (if compiling option `ENABLE_MPI` has not been set)
+    - scalapack_gvx (if compiling option `USE_ELPA` has not been set and compiling option `ENABLE_MPI` has been set)
+    - cusolver (if compiling option `USE_CUDA` has been set)
 
 ### nbands
 
diff --git a/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu b/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu
index a0f7442185..25975e6303 100644
--- a/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu
+++ b/source/module_hsolver/kernels/cuda/diag_cusolvermp.cu
@@ -10,13 +10,15 @@ extern "C"
 #include "module_hsolver/genelpa/Cblacs.h"
 }
 #include <iostream>
+#include <cstdint>
 #include "helper_cusolver.h"
 #include "module_base/global_function.h"
 #include "module_base/module_device/device.h"
 static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request)
 {
     MPI_Request req;
-    int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req);
+    intptr_t ptr = reinterpret_cast<intptr_t>(data);
+    int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)ptr, &req);
     if (err != MPI_SUCCESS)
     {
         return CAL_ERROR;
@@ -27,7 +29,8 @@ static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* da
 
 static calError_t request_test(void* request)
 {
-    MPI_Request req = (MPI_Request)(request);
+    intptr_t ptr = reinterpret_cast<intptr_t>(request);
+    MPI_Request req = (MPI_Request)ptr;
     int completed;
     int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE);
     if (err != MPI_SUCCESS)
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 4e7795a853..f7b861157e 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -69,6 +69,7 @@ void ReadInput::item_elec_stru()
                 "lapack",
                 "scalapack_gvx",
                 "cusolver",
+                "cusolvermp",
                 "pexsi",
                 "cg_in_lcao",
             };