From fbb6f2cccd2643405dc8ce7bed57582235f9c4b8 Mon Sep 17 00:00:00 2001
From: Zhao Tianqi <hongriTianqi@users.noreply.github.com>
Date: Mon, 22 Apr 2024 10:23:02 +0800
Subject: [PATCH 01/13] Feature: output mag force in spin constraint
 calculation (#4037)

* Feature: output mag force in spin constraint calculation

* add ut for print_Mag_Force
---
 source/module_esolver/esolver_ks_lcao.cpp     |  1 +
 .../module_deltaspin/lambda_loop_helper.cpp   |  4 +-
 .../module_deltaspin/spin_constrain.cpp       | 53 +++++++++++++++++--
 .../module_deltaspin/spin_constrain.h         |  3 ++
 .../test/cal_mw_helper_test.cpp               |  6 ++-
 .../test/lambda_loop_helper_test.cpp          |  5 ++
 .../test/spin_constrain_test.cpp              |  6 ++-
 7 files changed, 69 insertions(+), 9 deletions(-)
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index d142ae8e24..949e6c4ba7 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -1131,6 +1131,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     {
         SpinConstrain<TK, psi::DEVICE_CPU>& sc = SpinConstrain<TK, psi::DEVICE_CPU>::getScInstance();
         sc.cal_MW(istep, &(this->LM), true);
+        sc.print_Mag_Force();
     }
 
     if (!GlobalV::CAL_FORCE && !GlobalV::CAL_STRESS)
diff --git a/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp b/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
index dcebe43f7f..38a53b3e68 100644
--- a/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
@@ -4,8 +4,8 @@
 template <>
 void SpinConstrain<std::complex<double>, psi::DEVICE_CPU>::print_termination()
 {
-    print_2d("after-optimization spin: (print in the inner loop): ", this->Mi_, this->nspin_);
-    print_2d("after-optimization lambda: (print in the inner loop): ", this->lambda_, this->nspin_);
+    print_2d("after-optimization spin (uB): (print in the inner loop): ", this->Mi_, this->nspin_);
+    print_2d("after-optimization lambda (Ry/uB): (print in the inner loop): ", this->lambda_, this->nspin_);
     std::cout << "Inner optimization for lambda ends." << std::endl;
     std::cout << "===============================================================================" << std::endl;
 }
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
index 8fce1484fc..e2c73335c3 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
@@ -1,4 +1,5 @@
 #include "spin_constrain.h"
+#include "module_base/formatter_fmt.h"
 
 #include <cmath>
 
@@ -541,20 +542,66 @@ void SpinConstrain<FPTYPE, Device>::print_Mi(bool print)
     int nat = this->get_nat();
     if (print)
     {
+        formatter::Fmt fmt;
+        fmt.set_width(20);
+        fmt.set_precision(10);
+        fmt.set_fillChar(' ');
+        fmt.set_fixed(false);
+        fmt.set_right(true);
+        fmt.set_error(false);
+        std::cout << "Total Magnetism (uB): " << std::endl;
         for (int iat = 0; iat < nat; ++iat)
         {
             if (this->nspin_ == 2)
             {
-                std::cout << "Total Magnetism on atom: " << iat << " " << std::setprecision(10) << " (" << Mi_[iat].z << ")" << std::endl;
+                std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(Mi_[iat].z) << std::endl;
             }
             else if (this->nspin_ ==4)
             {
-                std::cout << "Total Magnetism on atom: " << iat << " " << std::setprecision(10) << " (" << Mi_[iat].x
-                        << ", " << Mi_[iat].y << ", " << Mi_[iat].z << ")" << std::endl;
+                std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(Mi_[iat].x) << fmt.format(Mi_[iat].y) << fmt.format(Mi_[iat].z) << std::endl;
             }
         }
     }
 }
 
+/// print magnetic force (defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+template <typename FPTYPE, typename Device>
+void SpinConstrain<FPTYPE, Device>::print_Mag_Force()
+{
+    this->check_atomCounts();
+    int nat = this->get_nat();
+    formatter::Fmt fmt;
+    fmt.set_width(20);
+    fmt.set_precision(10);
+    fmt.set_fillChar(' ');
+    fmt.set_fixed(false);
+    fmt.set_right(true);
+    fmt.set_error(false);
+    std::cout << "Final optimal lambda (Ry/uB): " << std::endl;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        if (this->nspin_ == 2)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(lambda_[iat].z) << std::endl;
+        }
+        else if (this->nspin_ ==4)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(lambda_[iat].x) << fmt.format(lambda_[iat].y) << fmt.format(lambda_[iat].z) << std::endl;
+        }
+    }
+    std::cout << "Magnetic force (Ry/uB): " << std::endl;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        if (this->nspin_ == 2)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(-lambda_[iat].z) << std::endl;
+        }
+        else if (this->nspin_ ==4)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(-lambda_[iat].x) << fmt.format(-lambda_[iat].y) << fmt.format(-lambda_[iat].z) << std::endl;
+        }
+    }
+}
+
 template class SpinConstrain<std::complex<double>, psi::DEVICE_CPU>;
 template class SpinConstrain<double, psi::DEVICE_CPU>;
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
index 3ec76ac04f..a966467586 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
@@ -87,6 +87,9 @@ class SpinConstrain
   /// print mi
   void print_Mi(bool print = false);
 
+  /// print magnetic force, defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+  void print_Mag_Force();
+
   /// collect_mw from matrix multiplication result
   void collect_MW(ModuleBase::matrix& MecMulP, const ModuleBase::ComplexMatrix& mud, int nw, int isk);
 
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
index 628cc4f2c1..4c8bb14dac 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
@@ -66,7 +66,8 @@ TEST_F(SpinConstrainTest, CalculateMW)
     testing::internal::CaptureStdout();
     sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (2, 3, 4)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         2.0000000000e+00    3.0000000000e+00    4.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CollectMW)
@@ -141,7 +142,8 @@ TEST_F(SpinConstrainTest, CalculateMWS2)
     testing::internal::CaptureStdout();
     sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (-1)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0        -1.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CollectMWS2)
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
index d5d4e42144..a7183263fe 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
@@ -40,10 +40,15 @@ TEST_F(SpinConstrainTest, PrintTermination)
     sc.set_sc_lambda(sc_lambda.data(), 1);
     testing::internal::CaptureStdout();
     sc.print_termination();
+    sc.print_Mag_Force();
     std::string output = testing::internal::GetCapturedStdout();
     EXPECT_THAT(output, testing::HasSubstr("Inner optimization for lambda ends."));
     EXPECT_THAT(output, testing::HasSubstr("ATOM 1         0.0000000000e+00    0.0000000000e+00    0.0000000000e+00"));
     EXPECT_THAT(output, testing::HasSubstr("ATOM 1         1.0000000000e+00    2.0000000000e+00    3.0000000000e+00"));
+    EXPECT_THAT(output, testing::HasSubstr("Final optimal lambda (Ry/uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 1         1.0000000000e+00    2.0000000000e+00    3.0000000000e+00"));
+    EXPECT_THAT(output, testing::HasSubstr("Magnetic force (Ry/uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0        -1.0000000000e+00   -2.0000000000e+00   -3.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CheckRmsStop)
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
index e23add1002..cd1f7c2956 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
@@ -398,10 +398,12 @@ TYPED_TEST(SpinConstrainTest, PrintMi)
     testing::internal::CaptureStdout();
     this->sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (0, 0, 0)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         0.0000000000e+00    0.0000000000e+00    0.0000000000e+00"));
     this->sc.set_nspin(2);
      testing::internal::CaptureStdout();
     this->sc.print_Mi(true);
     output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (0)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         0.0000000000e+00"));
 }
\ No newline at end of file

From 6eb39e8431adb480c58f10ca71b5425a73998d28 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Mon, 22 Apr 2024 10:28:55 +0800
Subject: [PATCH 02/13] CI: update cuda tests (#4032)

Co-authored-by: Haozhi Han <haozhi.han@stu.pku.edu.cn>
---
 .github/workflows/cuda.yml | 81 ++++----------------------------------
 1 file changed, 7 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 2832a3e02a..6df2577bb8 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -4,88 +4,21 @@ on:
   workflow_dispatch:
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: start
-          github-token: ${{ secrets.PAT }}
-          ec2-image-id: ami-04cd9fec4a7a39019
-          ec2-instance-type: g4dn.xlarge
-          subnet-id: subnet-72d3e53e
-          security-group-id: sg-06b0c93122c08aeab
-
   test:
-    name: Do the job on the runner
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    name: Test on CUDA Build
+    runs-on: nvidia
     container:
       image: ghcr.io/deepmodeling/abacus-cuda
       options: --gpus all
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      - name: Build cuSolver
+        with:
+          submodules: recursive
+      - name: Build
         run: |
           nvidia-smi
-          cmake -B build -DUSE_CUSOLVER_LCAO=ON
+          cmake -B build -DUSE_CUDA=ON -DBUILD_TESTING=ON
           cmake --build build -j4
           cmake --install build
-          cmake -B build -DBUILD_TESTING=ON
-          cmake --build build -j4 --target hsolver_diago
-      - name: Test e2e
-        run: |
-          export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-          cd tests/integrate
-          echo "ks_solver cusolver" >> ./270_NO_MD_2O/INPUT
-          ./Autotest.sh -r 270_NO_MD_2O
-      - name: Test UT
-        run: |
-          cd source/src_pdiag/test/
-          cp ../../../build/source/src_pdiag/test/hsolver_diago .
-          ./hsolver_diago
-          bash diago_parallel_test.sh
-      - name: Test performance
-        run: |
-          cd examples/performance
-          ls -d P1*lcao* > allcase
-          sed -i '/ks_solver/d' P1*lcao*/INPUT
-          sed -i '$a ks_solver cusolver' P1*lcao*/INPUT
-          bash run.sh
-          cat sumall.dat
-
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - test # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: stop
-          github-token: ${{ secrets.PAT }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+      # TODO: add tests

From f709ba050552ebf03a3bcddf4361da5eec581630 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Mon, 22 Apr 2024 13:27:22 +0800
Subject: [PATCH 03/13] add some docs (#4039)

---
 docs/advanced/scf/converge.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/advanced/scf/converge.md b/docs/advanced/scf/converge.md
index f74da82522..4f77f2f3b7 100644
--- a/docs/advanced/scf/converge.md
+++ b/docs/advanced/scf/converge.md
@@ -10,6 +10,8 @@ For each of the mixing types, we also provide variables for controlling relevant
 
 `mixing_ndim` is the mixing dimensions in DIIS (broyden or pulay) mixing. Gerenally, a larger `mixing_ndim` leads to a better convergence. the default choice `mixing_ndim=8` should work fine in most cases. For `mixing_type`, the default choice is `broyden`, which is slightly better than `Pulay` typically. Besides that, a large `mixing_beta` means a larger change in electron density for each SCF step. For well-behaved systems, a larger `mixing_beta` leads to faster convergence. However, for some difficult cases, a smaller `mixing_beta` is preferred to avoid numerical instabilities.
 
+For most isolated systems, Kerker preconditioning is unnecessary. You can turn off it by setting `mixing_gg0  0.0` to get a faster convergence.
+
 For non-spin-polarized calculations, the default choices usually achieve convergence. If convergence issue arises in metallic systems, you can try different value of Kerker preconditioning [mixing_gg0](../input_files/input-main.md#mixing_gg0) and [mixing_gg0_min](../input_files/input-main.md#mixing_gg0_min), and try to reduce `mixing_beta`, which is 0.8 defaultly for `nspin=1`.
 
 For magnetic calculations, `mixing_beta_mag` and `mixing_gg0_mag` are activated. Considering collinear calculations, you can rely on the default value for most cases. If convergence issue arises, you can try to reduce `mixing_beta` and `mixing_beta_mag` together. For non-collinear calculations, tradtional broyden usually works, especially for a given magnetic configuration. If one is not interested in the energies of a given magnetic configuration but wants to determine the ground state by relaxing the magnetic moments’ directions, the standard Broyden mixing algorithm sometimes fails to find the correct magnetic configuration. If so, we can set [mixing_angle=1.0](../input_files/input-main.md#mixing_angle), which is a promising mixing method proposed by J. Phys. Soc. Jpn. 82 (2013) 114706.

From 25244770a1a8ea0cd999e9b99b084f8262cefa02 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 23 Apr 2024 14:34:02 +0800
Subject: [PATCH 04/13] fix final mag in STRU_ION_D (#4044)

---
 source/module_cell/read_atoms.cpp    | 30 +++++++++++++++++-----------
 source/module_cell/unitcell.h        |  2 +-
 source/module_io/mulliken_charge.cpp | 16 +++++++++++++++
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/source/module_cell/read_atoms.cpp b/source/module_cell/read_atoms.cpp
index c5836b8aea..acf9a42600 100644
--- a/source/module_cell/read_atoms.cpp
+++ b/source/module_cell/read_atoms.cpp
@@ -1003,6 +1003,7 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
 
 	if(type==1)
 	{
+		int nat_tmp = 0;
 		ofs << "Cartesian" << std::endl;
 		for(int it=0; it<ntype; it++)
 		{
@@ -1028,28 +1029,31 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
                     ofs << context.str();
                 }
 
-                if (GlobalV::NSPIN == 2)
+                if (GlobalV::NSPIN == 2 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
                     context.set_context("double_w6_f2");
-                    context << atoms[it].mag[ia];
+                    context << atom_mulliken[nat_tmp][1];
                     ofs << context.str();
                 }
-                else if (GlobalV::NSPIN == 4)
+                else if (GlobalV::NSPIN == 4 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
-                    context.set_context("vector3d");
-                    context << atoms[it].m_loc_[ia].x << " " << atoms[it].m_loc_[ia].y << " " << atoms[it].m_loc_[ia].z;
-                    ofs << context.str();
+					ofs << std::fixed << std::setprecision(5);
+                    ofs << std::setw(8) << atom_mulliken[nat_tmp][1] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][2] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][3];
                 }
                 ofs << std::endl;
+				nat_tmp++;
 			}
 		}
 	}
 	else if(type==2)
 	{
+		int nat_tmp = 0;
 		ofs << "Direct" << std::endl;
 		for(int it=0; it<ntype; it++)
 		{
@@ -1075,23 +1079,25 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
                     ofs << context.str();
                 }
 
-                if (GlobalV::NSPIN == 2)
+                if (GlobalV::NSPIN == 2 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
                     context.set_context("double_w6_f2");
-                    context << atoms[it].mag[ia];
+                    context << atom_mulliken[nat_tmp][1];
                     ofs << context.str();
                 }
-                else if (GlobalV::NSPIN == 4)
+                else if (GlobalV::NSPIN == 4 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
-                    context.set_context("vector3d");
-                    context << atoms[it].m_loc_[ia].x << " " << atoms[it].m_loc_[ia].y << " " << atoms[it].m_loc_[ia].z;
-                    ofs << context.str();
+					ofs << std::fixed << std::setprecision(5);
+                    ofs << std::setw(8) << atom_mulliken[nat_tmp][1] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][2] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][3];
                 }
                 ofs << std::endl;
+				nat_tmp++;
 			}
 		}
 	}
diff --git a/source/module_cell/unitcell.h b/source/module_cell/unitcell.h
index 6223e3b9f6..1d1e520da9 100644
--- a/source/module_cell/unitcell.h
+++ b/source/module_cell/unitcell.h
@@ -23,7 +23,7 @@ class UnitCell
     Magnetism magnet;  // magnetism Yu Liu 2021-07-03
     void cal_ux();
     bool judge_parallel(double a[3],ModuleBase::Vector3<double> b);
-	double *atom_mag;
+    std::vector<std::vector<double>> atom_mulliken;  //[nat][nspin]
 	int n_mag_at;
 
     std::string& Coordinate = lat.Coordinate;
diff --git a/source/module_io/mulliken_charge.cpp b/source/module_io/mulliken_charge.cpp
index 72b0046cc3..60983030cb 100644
--- a/source/module_io/mulliken_charge.cpp
+++ b/source/module_io/mulliken_charge.cpp
@@ -312,6 +312,16 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
 		os << "Decomposed Mulliken populations" << std::endl;
         GlobalV::ofs_running << std::endl <<std::endl;
 
+        // alloacte space for ucell.atom_mulliken
+        if (GlobalC::ucell.atom_mulliken.empty())
+        {
+            GlobalC::ucell.atom_mulliken.resize(GlobalC::ucell.nat);
+            for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
+            {
+                GlobalC::ucell.atom_mulliken[iat].resize(GlobalV::NSPIN);
+            }
+        }
+
         for (size_t i = 0; i != GlobalC::ucell.nat; ++i)
         {
             double total_charge = 0.0, atom_mag = 0.0;
@@ -428,6 +438,8 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
                 os << "Total Charge on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << total_charge <<std::endl;
                 os << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << ModuleIO::output_cut(atom_mag) <<std::endl;
                 GlobalV::ofs_running << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << std::setprecision(10) << atom_mag <<std::endl;
+                GlobalC::ucell.atom_mulliken[i][0] = total_charge;
+                GlobalC::ucell.atom_mulliken[i][1] = atom_mag;
             }
             else if (GlobalV::NSPIN==4)
             {
@@ -443,6 +455,10 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
                 GlobalV::ofs_running << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(10)
                 << "("  << std::setprecision(10) << spin2 << ", " << std::setprecision(10) << spin3 << ", " << std::setprecision(10) << spin4 << ")"
                 <<std::endl;
+                GlobalC::ucell.atom_mulliken[i][0] = spin1;
+                GlobalC::ucell.atom_mulliken[i][1] = spin2;
+                GlobalC::ucell.atom_mulliken[i][2] = spin3;
+                GlobalC::ucell.atom_mulliken[i][3] = spin4;
             }
             os << std::endl <<std::endl;
         }

From bec5e64685a0255588b7ba955963571521cb0cc0 Mon Sep 17 00:00:00 2001
From: Zhao Tianqi <hongriTianqi@users.noreply.github.com>
Date: Wed, 24 Apr 2024 12:32:28 +0800
Subject: [PATCH 05/13] Test: case test for noncolin force (#4048)

---
 tests/integrate/204_NO_KP_NC/INPUT            |   2 +
 tests/integrate/204_NO_KP_NC/STRU             |   2 +-
 tests/integrate/204_NO_KP_NC/result.ref       |   8 +-
 .../204_NO_KP_NC_deltaspin/mulliken.txt.ref   | 168 +++++++++---------
 .../204_NO_KP_NC_deltaspin/result.ref         |   7 +-
 5 files changed, 96 insertions(+), 91 deletions(-)

diff --git a/tests/integrate/204_NO_KP_NC/INPUT b/tests/integrate/204_NO_KP_NC/INPUT
index 8886378fdf..4bb8c943c4 100644
--- a/tests/integrate/204_NO_KP_NC/INPUT
+++ b/tests/integrate/204_NO_KP_NC/INPUT
@@ -23,3 +23,5 @@ smearing_sigma			0.002
 
 #Parameters (5.Mixing)
 
+
+cal_force 1
diff --git a/tests/integrate/204_NO_KP_NC/STRU b/tests/integrate/204_NO_KP_NC/STRU
index 73cf74c4f9..cd92e43261 100644
--- a/tests/integrate/204_NO_KP_NC/STRU
+++ b/tests/integrate/204_NO_KP_NC/STRU
@@ -18,5 +18,5 @@ Direct
 Si	// Element type	
 0.0	// magnetism
 2	
-0.00 0.00 0.00 1 1 1 mag 1.0 angle1 90 angle2 0
+0.10 0.00 0.00 1 1 1 mag 1.0 angle1 90 angle2 0
 0.25 0.25 0.25 1 1 1 mag 1.0 angle1 90 angle2 180
diff --git a/tests/integrate/204_NO_KP_NC/result.ref b/tests/integrate/204_NO_KP_NC/result.ref
index e0ce28aca8..d1c497de0c 100644
--- a/tests/integrate/204_NO_KP_NC/result.ref
+++ b/tests/integrate/204_NO_KP_NC/result.ref
@@ -1,3 +1,5 @@
-etotref -211.1858438233880
-etotperatomref -105.5929219117
-totaltimeref 1.3488
+etotref -210.3233812890469
+etotperatomref -105.1616906445
+totalforceref 17.244244
+totaltimeref 28.3911
+23.58
diff --git a/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref b/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
index bffad6b08a..168c5723a3 100644
--- a/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
+++ b/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
@@ -3,92 +3,92 @@ CALCULATE THE MULLIkEN ANALYSIS FOR EACH ATOM
  Total charge:	32
 Decomposed Mulliken populations
 0                 Zeta of Fe                        Spin 1                        Spin 2                        Spin 3                        Spin 4
-s                        0                           1.317                       0.06196                       -0.2625                      -0.07949
-  sum over m                                         1.317                       0.06196                       -0.2625                      -0.07949
-s                        1                           1.726                      -0.01809                       0.09886                      -0.01413
-  sum over m                                         1.726                      -0.01809                       0.09886                      -0.01413
-s                        2                         0.03246                      -0.04153                        0.2209                      -0.02228
-  sum over m                                       0.03246                      -0.04153                        0.2209                      -0.02228
-s                        3                        -0.02921                      0.005609                        -0.025                     -0.005114
-  sum over m                                      -0.02921                      0.005609                        -0.025                     -0.005114
-  sum over m+zeta                                    3.046                      0.007945                        0.0323                        -0.121
-pz                        0                           2.034                     -0.001186                      0.005932                    -3.981e-06
-px                        0                           2.033                     -0.001283                      0.006419                    -3.989e-06
-py                        0                           2.033                     -0.001188                      0.005944                    -3.979e-06
-  sum over m                                           6.1                     -0.003658                        0.0183                    -1.195e-05
-pz                        1                        -0.02621                     0.0005578                     -0.002789                             0
-px                        1                        -0.02639                     0.0006107                     -0.003054                             0
-py                        1                        -0.02603                     0.0005536                     -0.002768                             0
-  sum over m                                      -0.07863                      0.001722                     -0.008611                             0
-  sum over m+zeta                                    6.021                     -0.001936                      0.009684                    -1.277e-05
-dz^2                        0                           1.964                     0.0008269                     -0.004128                    -1.088e-05
-dxz                        0                           1.044                         0.156                       -0.7849                    -0.0003055
-dyz                        0                          0.9592                        0.1564                       -0.7869                    -0.0003096
-dx^2-y^2                        0                           1.967                      0.000752                     -0.003754                    -1.059e-05
-dxy                        0                           1.055                        0.1558                       -0.7835                    -0.0003047
-  sum over m                                         6.988                        0.4698                        -2.363                    -0.0009413
-dz^2                        1                         0.03863                    -0.0008716                      0.004365                    -1.357e-05
-dxz                        1                        -0.03708                     -0.004148                       0.02101                     1.956e-05
-dyz                        1                        -0.03373                     -0.004494                       0.02274                     1.968e-05
-dx^2-y^2                        1                         0.03943                    -0.0009117                      0.004566                    -1.471e-05
-dxy                        1                        -0.03733                     -0.004056                       0.02055                     1.945e-05
-  sum over m                                      -0.03008                      -0.01448                       0.07324                     3.041e-05
-  sum over m+zeta                                    6.958                        0.4553                         -2.29                    -0.0009109
-fz^3                        0                       -0.007044                     0.0007552                     -0.003776                    -1.406e-06
-fxz^2                        0                       -0.002046                     0.0002628                     -0.001314                             0
-fyz^2                        0                        -0.00273                       0.00029                      -0.00145                             0
-fzx^2-zy^2                        0                       5.811e-05                             0                     3.451e-06                             0
-fxyz                        0                        1.14e-05                     1.249e-06                    -6.306e-06                             0
-fx^3-3*xy^2                        0                       -0.003379                     0.0004381                      -0.00219                             0
-f3yx^2-y^3                        0                        -0.00407                     0.0004626                     -0.002313                             0
-  sum over m                                       -0.0192                      0.002209                      -0.01105                    -4.307e-06
-  sum over m+zeta                                  -0.0192                      0.002209                      -0.01105                    -4.307e-06
-Total Charge on atom:  Fe               16.01
-Total Magnetism on atom:  Fe                   (0.4635, -2.259, -0.1219)
+s                        0                           1.317                       0.05552                        0.2843                       0.02903
+  sum over m                                         1.317                       0.05552                        0.2843                       0.02903
+s                        1                           1.726                      -0.01923                      -0.09498                      0.005159
+  sum over m                                         1.726                      -0.01923                      -0.09498                      0.005159
+s                        2                         0.03246                      -0.04333                       -0.2148                      0.008137
+  sum over m                                       0.03246                      -0.04333                       -0.2148                      0.008137
+s                        3                        -0.02921                      0.005194                       0.02641                      0.001867
+  sum over m                                      -0.02921                      0.005194                       0.02641                      0.001867
+  sum over m+zeta                                    3.046                     -0.001842                     0.0009368                       0.04419
+pz                        0                           2.034                     -0.001185                     -0.005932                     1.545e-06
+px                        0                           2.033                     -0.001283                     -0.006419                     1.538e-06
+py                        0                           2.033                     -0.001188                     -0.005944                     1.543e-06
+  sum over m                                           6.1                     -0.003656                      -0.01829                     4.626e-06
+pz                        1                        -0.02622                     0.0005602                      0.002791                             0
+px                        1                        -0.02639                     0.0006145                      0.003054                             0
+py                        1                        -0.02603                     0.0005563                       0.00277                             0
+  sum over m                                      -0.07864                      0.001731                      0.008615                             0
+  sum over m+zeta                                    6.021                     -0.001925                      -0.00968                     5.611e-06
+dz^2                        0                           1.964                     0.0008273                      0.004131                     4.077e-06
+dxz                        0                           1.044                        0.1755                        0.7507                      0.002258
+dyz                        0                          0.9544                        0.1768                        0.7532                      0.002329
+dx^2-y^2                        0                           1.967                     0.0007523                      0.003756                     3.978e-06
+dxy                        0                           1.055                        0.1751                        0.7495                      0.002251
+  sum over m                                         6.984                         0.529                         2.261                      0.006846
+dz^2                        1                         0.03863                    -0.0008699                     -0.004363                     5.197e-06
+dxz                        1                        -0.03759                     -0.005346                      -0.01936                    -0.0001322
+dyz                        1                        -0.03407                     -0.005734                      -0.02118                    -0.0001342
+dx^2-y^2                        1                         0.03943                    -0.0009093                     -0.004564                     5.691e-06
+dxy                        1                        -0.03787                     -0.005246                       -0.0189                    -0.0001314
+  sum over m                                      -0.03146                      -0.01811                      -0.06836                     -0.000387
+  sum over m+zeta                                    6.952                        0.5109                         2.193                      0.006459
+fz^3                        0                       -0.007049                     0.0007578                      0.003775                             0
+fxz^2                        0                       -0.002045                     0.0002638                      0.001312                             0
+fyz^2                        0                       -0.002729                     0.0002912                      0.001448                             0
+fzx^2-zy^2                        0                       6.273e-05                             0                    -6.642e-06                             0
+fxyz                        0                       1.153e-05                     1.446e-06                     5.675e-06                             0
+fx^3-3*xy^2                        0                        -0.00338                       0.00044                      0.002189                             0
+f3yx^2-y^3                        0                        -0.00407                     0.0004646                      0.002311                             0
+  sum over m                                       -0.0192                      0.002219                       0.01103                     2.581e-06
+  sum over m+zeta                                  -0.0192                      0.002219                       0.01103                     2.581e-06
+Total Charge on atom:  Fe                  16
+Total Magnetism on atom:  Fe                   (0.5093, 2.195, 0.05066)
 
 
 1                 Zeta of Fe                        Spin 1                        Spin 2                        Spin 3                        Spin 4
-s                        0                           1.275                       0.04699                       -0.2823                       0.07949
-  sum over m                                         1.275                       0.04699                       -0.2823                       0.07949
-s                        1                           1.755                      -0.01866                       0.08491                       0.01412
-  sum over m                                         1.755                      -0.01866                       0.08491                       0.01412
-s                        2                        -0.02899                      -0.04221                        0.1978                       0.02226
-  sum over m                                      -0.02899                      -0.04221                        0.1978                       0.02226
-s                        3                        -0.04712                       0.00595                      -0.03281                      0.005133
-  sum over m                                      -0.04712                       0.00595                      -0.03281                      0.005133
-  sum over m+zeta                                    2.954                     -0.007928                      -0.03239                         0.121
-pz                        0                           2.032                     -0.001371                       0.00685                     3.967e-06
-px                        0                           2.025                    -0.0009218                      0.004606                     3.958e-06
-py                        0                           2.032                     -0.001333                      0.006664                     3.965e-06
-  sum over m                                         6.089                     -0.003626                       0.01812                     1.189e-05
-pz                        1                        -0.02529                     0.0005803                     -0.002904                             0
-px                        1                        -0.01606                     0.0001295                    -0.0006492                             0
-py                        1                        -0.02466                     0.0005625                     -0.002815                             0
-  sum over m                                      -0.06602                      0.001272                     -0.006367                             0
-  sum over m+zeta                                    6.023                     -0.002353                       0.01175                      1.25e-05
-dz^2                        0                           1.957                      0.001154                     -0.005778                     1.149e-05
-dxz                        0                           1.091                        0.1517                       -0.7637                    -8.462e-05
-dyz                        0                          0.9556                        0.1553                       -0.7815                    -8.443e-05
-dx^2-y^2                        0                           1.947                      0.001648                     -0.008249                     1.233e-05
-dxy                        0                           1.106                        0.1508                       -0.7591                    -8.432e-05
-  sum over m                                         7.056                        0.4606                        -2.318                    -0.0002295
-dz^2                        1                         0.03925                     -0.001067                      0.005328                     1.289e-05
-dxz                        1                        -0.03558                     -0.002824                       0.01439                       2.5e-06
-dyz                        1                        -0.03117                     -0.003962                        0.0201                     2.798e-06
-dx^2-y^2                        1                         0.04266                     -0.001401                      0.006997                      1.29e-05
-dxy                        1                        -0.03637                     -0.002747                       0.01401                     2.475e-06
-  sum over m                                      -0.02122                        -0.012                       0.06082                     3.356e-05
-  sum over m+zeta                                    7.035                        0.4486                        -2.257                     -0.000196
-fz^3                        0                       -0.006615                     0.0007206                     -0.003605                     1.352e-06
-fxz^2                        0                       -0.001955                     0.0002554                     -0.001278                             0
-fyz^2                        0                       -0.002684                     0.0002735                     -0.001368                             0
-fzx^2-zy^2                        0                       9.383e-05                      1.68e-05                    -8.473e-05                             0
-fxyz                        0                       2.053e-05                      3.66e-06                    -1.839e-05                             0
-fx^3-3*xy^2                        0                       -0.003204                     0.0004266                     -0.002134                             0
-f3yx^2-y^3                        0                       -0.003695                     0.0004558                     -0.002281                             0
-  sum over m                                      -0.01804                      0.002152                      -0.01077                     4.022e-06
-  sum over m+zeta                                 -0.01804                      0.002152                      -0.01077                     4.022e-06
-Total Charge on atom:  Fe               15.99
-Total Magnetism on atom:  Fe                   (0.4405, -2.289, 0.1208)
+s                        0                           1.275                       0.05341                        0.2605                      -0.02903
+  sum over m                                         1.275                       0.05341                        0.2605                      -0.02903
+s                        1                           1.755                      -0.01752                      -0.08879                     -0.005156
+  sum over m                                         1.755                      -0.01752                      -0.08879                     -0.005156
+s                        2                        -0.02898                       -0.0404                       -0.2039                      -0.00813
+  sum over m                                      -0.02898                       -0.0404                       -0.2039                      -0.00813
+s                        3                        -0.04711                      0.006367                       0.03139                     -0.001874
+  sum over m                                      -0.04711                      0.006367                       0.03139                     -0.001874
+  sum over m+zeta                                    2.954                      0.001862                    -0.0008532                      -0.04419
+pz                        0                           2.032                     -0.001369                     -0.006852                    -1.367e-06
+px                        0                           2.025                    -0.0009208                     -0.004608                    -1.387e-06
+py                        0                           2.032                     -0.001332                     -0.006666                    -1.366e-06
+  sum over m                                         6.089                     -0.003622                      -0.01813                    -4.119e-06
+pz                        1                        -0.02528                     0.0005889                      0.002889                             0
+px                        1                        -0.01606                     0.0001369                     0.0006408                             0
+py                        1                        -0.02466                      0.000571                      0.002802                             0
+  sum over m                                        -0.066                      0.001297                      0.006331                     2.367e-06
+  sum over m+zeta                                    6.023                     -0.002325                      -0.01179                    -1.753e-06
+dz^2                        0                           1.957                      0.001158                      0.005774                    -3.913e-06
+dxz                        0                           1.097                        0.1724                        0.7275                      0.002311
+dyz                        0                          0.9509                        0.1759                        0.7475                      0.002269
+dx^2-y^2                        0                           1.947                      0.001654                      0.008245                    -4.075e-06
+dxy                        0                           1.113                        0.1714                        0.7227                      0.002304
+  sum over m                                         7.065                        0.5225                         2.212                      0.006876
+dz^2                        1                         0.03925                     -0.001062                     -0.005333                    -4.383e-06
+dxz                        1                         -0.0366                     -0.003947                      -0.01263                    -0.0001213
+dyz                        1                        -0.03157                     -0.005197                      -0.01856                    -0.0001267
+dx^2-y^2                        1                         0.04266                     -0.001394                     -0.007002                    -4.206e-06
+dxy                        1                        -0.03743                     -0.003854                      -0.01222                    -0.0001203
+  sum over m                                      -0.02369                      -0.01545                      -0.05575                    -0.0003768
+  sum over m+zeta                                    7.041                        0.5071                         2.156                      0.006499
+fz^3                        0                       -0.006614                     0.0007261                      0.003596                             0
+fxz^2                        0                       -0.001954                     0.0002565                      0.001276                             0
+fyz^2                        0                       -0.002684                     0.0002742                      0.001366                             0
+fzx^2-zy^2                        0                        9.09e-05                      1.99e-05                     8.018e-05                             0
+fxyz                        0                       2.062e-05                     4.102e-06                     1.816e-05                             0
+fx^3-3*xy^2                        0                       -0.003203                     0.0004291                       0.00213                             0
+f3yx^2-y^3                        0                       -0.003698                     0.0004635                      0.002271                             0
+  sum over m                                      -0.01804                      0.002174                       0.01074                             0
+  sum over m+zeta                                 -0.01804                      0.002174                       0.01074                             0
+Total Charge on atom:  Fe                  16
+Total Magnetism on atom:  Fe                   (0.5088, 2.154, -0.03769)
 
 
diff --git a/tests/integrate/204_NO_KP_NC_deltaspin/result.ref b/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
index 8a17a1fada..5db4c4ed06 100644
--- a/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
+++ b/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
@@ -1,4 +1,5 @@
-etotref -6844.326716364628
-etotperatomref -3422.1633581823
+etotref -6844.685232778927
+etotperatomref -3422.3426163895
 Compare_mulliken_pass 0
-totaltimeref 36.59
+totaltimeref 28.5759
+14.63

From 92ef679df24c406d83f5130ea655560e058fb6b6 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Fri, 26 Apr 2024 12:24:41 +0800
Subject: [PATCH 06/13] Fix: add a check_rho() to avoid negative charge density
 (#4053)

* add a check_rho() to avoid negative rho_up/rho_dn

* add mocks in UTs

* change warningquit to warning about rho

* Update source/module_elecstate/module_charge/charge.cpp

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>

* Update source/module_elecstate/module_charge/charge.cpp

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>

---------

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>
---
 source/module_elecstate/elecstate.cpp         |  1 +
 .../module_elecstate/module_charge/charge.cpp | 41 +++++++++++++++++--
 .../module_elecstate/module_charge/charge.h   |  6 ++-
 source/module_elecstate/test/charge_test.cpp  |  6 +--
 .../test/elecstate_base_test.cpp              |  3 ++
 .../test/elecstate_pw_test.cpp                |  3 ++
 6 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/source/module_elecstate/elecstate.cpp b/source/module_elecstate/elecstate.cpp
index 393c2d07d5..6a8af42fb7 100644
--- a/source/module_elecstate/elecstate.cpp
+++ b/source/module_elecstate/elecstate.cpp
@@ -218,6 +218,7 @@ void ElecState::init_scf(const int istep, const ModuleBase::ComplexMatrix& struc
     if (istep == 0)
     {
         this->charge->init_rho(this->eferm, strucfac, this->bigpw->nbz, this->bigpw->bz);
+        this->charge->check_rho(); // check the rho
     }
 
     // renormalize the charge density
diff --git a/source/module_elecstate/module_charge/charge.cpp b/source/module_elecstate/module_charge/charge.cpp
index b1c7dee91d..e659c46213 100644
--- a/source/module_elecstate/module_charge/charge.cpp
+++ b/source/module_elecstate/module_charge/charge.cpp
@@ -700,7 +700,7 @@ void Charge::save_rho_before_sum_band(void)
     return;
 }
 
-double Charge::check_ne(const double* rho_in) const
+double Charge::cal_rho2ne(const double* rho_in) const
 {
     double ne = 0.0;
     for (int ir = 0; ir < this->rhopw->nrxx; ir++)
@@ -711,12 +711,45 @@ double Charge::check_ne(const double* rho_in) const
     Parallel_Reduce::reduce_pool(ne);
 #endif
     ne = ne * elecstate::get_ucell_omega() / (double)this->rhopw->nxyz;
-    std::cout << std::setprecision(10);
-    std::cout << " check the electrons number from rho, ne =" << ne << std::endl;
-    std::cout << std::setprecision(6);
+
     return ne;
 }
 
+void Charge::check_rho()
+{
+    if (this->nspin==1 || this->nspin==4)
+    {
+        double ne = 0.0;
+        ne = this->cal_rho2ne(rho[0]);
+        if (std::abs(ne - GlobalV::nelec) > 1.0e-6)
+        {
+            ModuleBase::WARNING("Charge", "Charge is not equal to the number of electrons!");
+        }
+    }
+    else if (this->nspin == 2)
+    {
+        // for spin up
+        double ne_up = 0.0;
+        ne_up = this->cal_rho2ne(rho[0]);
+        if (ne_up < 0.0)
+        {
+            ModuleBase::WARNING_QUIT("Charge", "Number of spin-down electrons set in starting magnetization exceeds all available.");
+        }
+        // for spin down
+        double ne_dn = 0.0;
+        ne_dn = this->cal_rho2ne(rho[1]);
+        if (ne_dn < 0.0)
+        {
+            ModuleBase::WARNING_QUIT("Charge", "Number of spin-up electrons set in starting magnetization exceeds all available.");
+        }
+        // for total charge
+        if (std::abs(ne_up + ne_dn - GlobalV::nelec) > 1.0e-6)
+        {
+            ModuleBase::WARNING("Charge", "Charge is not equal to the number of electrons!");
+        }
+    }
+}
+
 // LiuXh add 20180619
 void Charge::init_final_scf()
 {
diff --git a/source/module_elecstate/module_charge/charge.h b/source/module_elecstate/module_charge/charge.h
index 58bc2ef615..71fdc25120 100644
--- a/source/module_elecstate/module_charge/charge.h
+++ b/source/module_elecstate/module_charge/charge.h
@@ -98,9 +98,11 @@ class Charge
         double *rhocg
     ) const;
 
-	double check_ne(const double *rho_in) const;
+	double cal_rho2ne(const double *rho_in) const;
 
-    void init_final_scf(); //LiuXh add 20180619
+  void check_rho(); // to check whether the charge density is normal
+
+  void init_final_scf(); //LiuXh add 20180619
 
 	public:
     /**
diff --git a/source/module_elecstate/test/charge_test.cpp b/source/module_elecstate/test/charge_test.cpp
index 4c50265284..37cdd0ab05 100644
--- a/source/module_elecstate/test/charge_test.cpp
+++ b/source/module_elecstate/test/charge_test.cpp
@@ -64,7 +64,7 @@ void Set_GlobalV_Default()
  *     - calculate \sum_{is}^nspin \sum_{ir}^nrxx rho[is][ir]
  *   - RenormalizeRho: Charge::renormalize_rho()
  *     - renormalize rho so as to ensure the sum of rho equals to total number of electrons
- *   - CheckNe: Charge::check_ne()
+ *   - CheckNe: Charge::cal_rho2ne()
  *     - check the total number of electrons summed from rho[is]
  *   - SaveRhoBeforeSumBand: Charge::save_rho_before_sum_band()
  *     - meaning as the function name
@@ -185,7 +185,7 @@ TEST_F(ChargeTest, CheckNe)
     elecstate::tmp_ucell_omega = ucell->omega;
     charge->renormalize_rho();
     EXPECT_NEAR(charge->sum_rho(), 8.0, 1e-10);
-    EXPECT_NEAR(charge->check_ne(charge->rho[0]), 8.0, 1e-10);
+    EXPECT_NEAR(charge->cal_rho2ne(charge->rho[0]), 8.0, 1e-10);
 }
 
 TEST_F(ChargeTest, SaveRhoBeforeSumBand)
@@ -207,7 +207,7 @@ TEST_F(ChargeTest, SaveRhoBeforeSumBand)
     elecstate::tmp_ucell_omega = ucell->omega;
     charge->renormalize_rho();
     charge->save_rho_before_sum_band();
-    EXPECT_NEAR(charge->check_ne(charge->rho_save[0]), 8.0, 1e-10);
+    EXPECT_NEAR(charge->cal_rho2ne(charge->rho_save[0]), 8.0, 1e-10);
 }
 
 TEST_F(ChargeTest, InitFinalScf)
diff --git a/source/module_elecstate/test/elecstate_base_test.cpp b/source/module_elecstate/test/elecstate_base_test.cpp
index 2f5ee12e6d..33b831e6a9 100644
--- a/source/module_elecstate/test/elecstate_base_test.cpp
+++ b/source/module_elecstate/test/elecstate_base_test.cpp
@@ -74,6 +74,9 @@ void Charge::set_rhopw(ModulePW::PW_Basis*)
 void Charge::renormalize_rho()
 {
 }
+void Charge::check_rho()
+{
+}
 
 /************************************************
  *  unit test of elecstate.cpp
diff --git a/source/module_elecstate/test/elecstate_pw_test.cpp b/source/module_elecstate/test/elecstate_pw_test.cpp
index ea859072ad..b82dd3cef8 100644
--- a/source/module_elecstate/test/elecstate_pw_test.cpp
+++ b/source/module_elecstate/test/elecstate_pw_test.cpp
@@ -144,6 +144,9 @@ void Charge::set_rhopw(ModulePW::PW_Basis*)
 void Charge::renormalize_rho()
 {
 }
+void Charge::check_rho()
+{
+}
 
 void Set_GlobalV_Default()
 {

From 7f84a0996a90e439c95a34e560e5fe33b452ac6c Mon Sep 17 00:00:00 2001
From: Peng Xingliang <91927439+pxlxingliang@users.noreply.github.com>
Date: Fri, 26 Apr 2024 19:07:47 +0800
Subject: [PATCH 07/13] update version (#4061)

---
 source/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/version.h b/source/version.h
index db044fd958..3be59f6341 100644
--- a/source/version.h
+++ b/source/version.h
@@ -1,3 +1,3 @@
 #ifndef VERSION
-#define VERSION "v3.6.1"
+#define VERSION "v3.6.2"
 #endif

From f372175d267b92e99e20c668e50111c5acae4860 Mon Sep 17 00:00:00 2001
From: GRYS <57103981+grysgreat@users.noreply.github.com>
Date: Sat, 27 Apr 2024 21:07:24 +0800
Subject: [PATCH 08/13] doc: add docs about using MPI in docker. (#4060)

* add docs.

* Update docs/community/faq.md

Co-authored-by: Chun Cai <amoycaic@gmail.com>

* Update docs/community/faq.md

Co-authored-by: Chun Cai <amoycaic@gmail.com>

---------

Co-authored-by: Chun Cai <amoycaic@gmail.com>
---
 docs/community/faq.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/community/faq.md b/docs/community/faq.md
index e2a1cdee02..724519a35e 100644
--- a/docs/community/faq.md
+++ b/docs/community/faq.md
@@ -80,6 +80,8 @@ If the program prompt something like "KILLED BY SIGNAL: 9 (Killed)", it may be c
 
 If the error message is "Segmentation fault", or there is no enough information on the error, please feel free to submit an issue.
 
+**4. Error "Read -1" when using mpirun in docker environment**
+This is a [known issue](https://github.com/open-mpi/ompi/issues/4948) of OpenMPI running in a docker container. In this case, please set environment variable `OMPI_MCA_btl_vader_single_copy_mechanism=none`.
 ## Miscellaneous
 
 **1. How to visualize charge density file?**

From b74f2a0b67fb190bc8d35a5d88f3ece6cdec94f0 Mon Sep 17 00:00:00 2001
From: Wenfei Li <38569667+wenfei-li@users.noreply.github.com>
Date: Sun, 28 Apr 2024 10:39:53 +0800
Subject: [PATCH 09/13] Feature : descriptors for equivariant DeePKS (#3894)

* Feature : add interfaces for calculating equivariant pdm

* add priting of equiv descriptors

* modify calculation of new pdm

* modify calculation of new pdm (multi-k)

---------

Co-authored-by: wenfei-li <liwenfei@gmail.com>
---
 .../module_esolver/esolver_ks_lcao_elec.cpp   |   2 +-
 .../esolver_ks_lcao_tmpfunc.cpp               |   8 +-
 .../hamilt_lcaodft/FORCE_gamma.cpp            |   2 +-
 .../hamilt_lcaodft/FORCE_k.cpp                |   4 +-
 .../operator_lcao/deepks_lcao.cpp             |  14 +-
 .../module_deepks/LCAO_deepks.cpp             |  47 ++--
 .../module_deepks/LCAO_deepks.h               |  23 +-
 .../module_deepks/LCAO_deepks_interface.cpp   |   6 +-
 .../module_deepks/LCAO_deepks_io.cpp          |  36 ++-
 .../module_deepks/LCAO_deepks_pdm.cpp         | 211 +++++++++++++-----
 .../module_deepks/LCAO_deepks_torch.cpp       |  74 ++++--
 .../module_deepks/test/LCAO_deepks_test.cpp   |   6 +-
 12 files changed, 313 insertions(+), 120 deletions(-)

diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index 5eeaaa2e63..35ee6df306 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -689,7 +689,7 @@ void ESolver_KS_LCAO<TK, TR>::nscf(void)
         const elecstate::DensityMatrix<TK, double>* dm
             = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
         this->dpks_cal_projected_DM(dm);
-        GlobalC::ld.cal_descriptor(); // final descriptor
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat); // final descriptor
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
     }
 #endif
diff --git a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
index a8ffb911f7..95fd4312a0 100644
--- a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
+++ b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
@@ -93,9 +93,7 @@ namespace ModuleESolver
         GlobalC::ld.cal_projected_DM_k(dm,
             GlobalC::ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->kv.nks,
-            this->kv.kvec_d);
+            GlobalC::GridD);
     }
 
 
@@ -106,9 +104,7 @@ namespace ModuleESolver
         GlobalC::ld.cal_projected_DM_k(dm,
             GlobalC::ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->kv.nks,
-            this->kv.kvec_d);
+            GlobalC::GridD);
     }
 #endif
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index a6670fdc9a..0751a9e46f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -73,7 +73,7 @@ void Force_LCAO_gamma::ftable_gamma(const bool isforce,
     {
         const std::vector<std::vector<double>>& dm_gamma = DM->get_DMK_vector();
         GlobalC::ld.cal_projected_DM(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD);
-        GlobalC::ld.cal_descriptor();
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat);
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
         GlobalC::ld.cal_f_delta_gamma(
 				dm_gamma, 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index 2dea6d49f3..736813ccbf 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -124,8 +124,8 @@ void Force_LCAO_k::ftable_k(const bool isforce,
     if (GlobalV::deepks_scf)
     {
         const std::vector<std::vector<std::complex<double>>>& dm_k = DM->get_DMK_vector();
-        GlobalC::ld.cal_projected_DM_k(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD, kv.nks, kv.kvec_d);
-        GlobalC::ld.cal_descriptor();
+        GlobalC::ld.cal_projected_DM_k(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat);
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
 
         GlobalC::ld.cal_f_delta_k(dm_k,
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index c5e3174539..b61a4b248c 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -156,7 +156,7 @@ void DeePKS<OperatorLCAO<double, double>>::contributeHR()
             *this->ucell,
             GlobalC::ORB,
             GlobalC::GridD);
-        GlobalC::ld.cal_descriptor();        
+        GlobalC::ld.cal_descriptor(this->ucell->nat);        
         GlobalC::ld.cal_gedm(this->ucell->nat);
         //GlobalC::ld.add_v_delta(*this->ucell,
         //    GlobalC::ORB,
@@ -189,10 +189,8 @@ void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
         GlobalC::ld.cal_projected_DM_k(this->DM,
             *this->ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->nks,
-            this->kvec_d);
-        GlobalC::ld.cal_descriptor();
+            GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
 
@@ -230,10 +228,8 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
         GlobalC::ld.cal_projected_DM_k(this->DM,
             *this->ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->nks,
-            this->kvec_d);
-        GlobalC::ld.cal_descriptor();
+            GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
 
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index 18192ab8a8..a29c0754fa 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -87,17 +87,36 @@ void LCAO_Deepks::init(
     assert(nm >= 0);
     assert(tot_inl_per_atom >= 0);
     
-    const int tot_inl = tot_inl_per_atom * nat;
+    int tot_inl = tot_inl_per_atom * nat;
+
+    if(if_equiv) tot_inl = nat;
 
     this->lmaxd = lm;
     this->nmaxd = nm;
-    this->inlmax = tot_inl;
+    
     GlobalV::ofs_running << " lmax of descriptor = " << this->lmaxd << std::endl;
     GlobalV::ofs_running << " nmax of descriptor= " << nmaxd << std::endl;
-	GlobalV::ofs_running << " total basis (all atoms) for descriptor= " << std::endl;
-    
-    //init pdm**
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+
+    int pdm_size = 0;
+    this->inlmax = tot_inl;
+    if(!if_equiv)
+    {
+        GlobalV::ofs_running << " total basis (all atoms) for descriptor= " << std::endl;
+
+        //init pdm**
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            pdm_size += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = pdm_size * pdm_size;
+        this->des_per_atom=pdm_size;
+        GlobalV::ofs_running << " Equivariant version, size of pdm matrices : " << pdm_size << std::endl;
+    }
+
     this->pdm = new double* [this->inlmax];
     for (int inl = 0;inl < this->inlmax;inl++)
     {
@@ -106,15 +125,17 @@ void LCAO_Deepks::init(
     }
 
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
-	this->des_per_atom=0; // mohan add 2021-04-21
-    for (int l = 0; l <= this->lmaxd; l++)
+    if(!if_equiv)
     {
-        this->des_per_atom += orb.Alpha[0].getNchi(l) * (2 * l + 1);
-    }
-
-    this->n_descriptor = nat * this->des_per_atom;
+        this->des_per_atom=0; // mohan add 2021-04-21
+        for (int l = 0; l <= this->lmaxd; l++)
+        {
+            this->des_per_atom += orb.Alpha[0].getNchi(l) * (2 * l + 1);
+        }
+        this->n_descriptor = nat * this->des_per_atom;
 
-    this->init_index(ntype, nat, na, tot_inl, orb);
+        this->init_index(ntype, nat, na, tot_inl, orb);
+    }
     this->allocate_nlm(nat);
 
     this->pv = &pv_in;
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index dbf6ee26ad..1ec77ed412 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -93,6 +93,8 @@ class LCAO_Deepks
     int nks_V_delta = 0;
 
     bool init_pdm = false; //for DeePKS NSCF calculation
+
+    bool if_equiv = false; //equivariant version
     
 	// deep neural network module that provides corrected Hamiltonian term and
 	// related derivatives.
@@ -106,7 +108,7 @@ class LCAO_Deepks
     std::vector<std::map<key_tuple,std::unordered_map<int,std::vector<std::vector<double>>>>> nlm_save_k;
 
     // projected density matrix
-	double** pdm;	//[tot_Inl][2l+1][2l+1]	caoyu modified 2021-05-07
+	double** pdm;	//[tot_Inl][2l+1][2l+1]	caoyu modified 2021-05-07; if equivariant version: [nat][nlm*nlm]
 	std::vector<torch::Tensor> pdm_tensor;
 
 	// descriptors
@@ -287,11 +289,20 @@ class LCAO_Deepks
         const elecstate::DensityMatrix<std::complex<double>, double>* dm,
         const UnitCell &ucell,
         const LCAO_Orbitals &orb,
-        Grid_Driver& GridD,
-        const int nks,
-        const std::vector<ModuleBase::Vector3<double>> &kvec_d);
+        Grid_Driver& GridD);
     void check_projected_dm(void);
 
+    void cal_projected_DM_equiv(
+        const elecstate::DensityMatrix<double, double>* dm,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver& GridD);
+    void cal_projected_DM_k_equiv(
+        const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver& GridD);
+        
     //calculate the gradient of pdm with regard to atomic positions
     //d/dX D_{Inl,mm'}
     void cal_gdmx(//const ModuleBase::matrix& dm,
@@ -439,10 +450,12 @@ class LCAO_Deepks
 
     ///Calculates descriptors
     ///which are eigenvalues of pdm in blocks of I_n_l
-	void cal_descriptor(void);
+	void cal_descriptor(const int nat);
     ///print descriptors based on LCAO basis
     void check_descriptor(const UnitCell &ucell);
 
+    void cal_descriptor_equiv(const int nat);
+
     ///calculates gradient of descriptors w.r.t atomic positions
     ///----------------------------------------------------
     ///m, n: 2*l+1
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
index a6b14dcb58..6c0f72e242 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
@@ -88,7 +88,7 @@ void LCAO_Deepks_Interface::out_deepks_labels(double etot,
         // this part is for integrated test of deepks
         ld->cal_projected_DM(dm, ucell, orb, GridD);
         ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
-        ld->cal_descriptor();     // final descriptor
+        ld->cal_descriptor(nat);     // final descriptor
         ld->check_descriptor(ucell);
 
         if (GlobalV::deepks_out_labels)
@@ -188,9 +188,9 @@ void LCAO_Deepks_Interface::out_deepks_labels(double etot,
     {
         // this part is for integrated test of deepks
         // so it is printed no matter even if deepks_out_labels is not used
-        ld->cal_projected_DM_k(dm, ucell, orb, GridD, nks, kvec_d);
+        ld->cal_projected_DM_k(dm, ucell, orb, GridD);
         ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
-        ld->cal_descriptor();     // final descriptor
+        ld->cal_descriptor(nat);     // final descriptor
         ld->check_descriptor(ucell);
 
         if (GlobalV::deepks_out_labels)
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
index 3d50ad646c..646e5c0a63 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
@@ -65,19 +65,39 @@ void LCAO_Deepks::save_npy_d(const int nat)
     ModuleBase::TITLE("LCAO_Deepks", "save_npy_d");
     if(GlobalV::MY_RANK!=0) return;
     //save descriptor in .npy format
-    vector<double> npy_des;
-    for (int inl = 0;inl < inlmax;++inl)
+    if(!if_equiv)
     {
-        int nm = 2*inl_l[inl] + 1;
-        for(int im=0;im<nm;im++)
+        vector<double> npy_des;
+        for (int inl = 0;inl < inlmax;++inl)
         {
-            npy_des.push_back(this->d_tensor[inl].index({im}).item().toDouble());
+            int nm = 2*inl_l[inl] + 1;
+            for(int im=0;im<nm;im++)
+            {
+                npy_des.push_back(this->d_tensor[inl].index({im}).item().toDouble());
+            }
+        }
+        const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
+        if (GlobalV::MY_RANK == 0)
+        {
+            npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
         }
     }
-    const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
-    if (GlobalV::MY_RANK == 0)
+    else
     {
-        npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
+        // a rather unnecessary way of writing this, but I'll do it for now
+        std::vector<double> npy_des;
+        for(int iat = 0; iat < nat; iat ++)
+        {
+            for(int i = 0; i < this->des_per_atom; i++)
+            {
+                npy_des.push_back(this->d_tensor[iat].index({i}).item().toDouble());
+            }
+        }
+        const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
+        if (GlobalV::MY_RANK == 0)
+        {
+            npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
+        }        
     }
     return;
 }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
index bf21360ebe..88a56582e6 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
@@ -34,7 +34,20 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_projected_DM");
 
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    int pdm_size;
+    if(!if_equiv)
+    {
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        int nproj = 0;
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = nproj * nproj;
+    }
     if (GlobalV::init_chg == "file" && !this->init_pdm) //for DeePKS NSCF calculation 
     {
         std::ifstream ifs("pdm.dat");
@@ -80,23 +93,42 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
             //trace alpha orbital
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
-            int ib=0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+            if(!if_equiv)
             {
-                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                int ib=0;
+                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                 {
-                    const int inl = this->inl_index[T0](I0, L0, N0);
-                    const int nm = 2*L0+1;
-            
-                    for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                     {
-                        for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        const int inl = this->inl_index[T0](I0, L0, N0);
+                        const int nm = 2*L0+1;
+                
+                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            trace_alpha_row.push_back(ib+m1);
-                            trace_alpha_col.push_back(ib+m2);
+                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                trace_alpha_row.push_back(ib+m1);
+                                trace_alpha_col.push_back(ib+m2);
+                            }
                         }
+                        ib+=nm;
+                    }
+                }
+            }
+            else
+            {
+                int nproj = 0;
+                for(int il = 0; il < this->lmaxd + 1; il++)
+                {
+                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                }
+                for(int iproj = 0; iproj < nproj; iproj ++)
+                {
+                    for(int jproj = 0; jproj < nproj; jproj ++)
+                    {
+                        trace_alpha_row.push_back(iproj);
+                        trace_alpha_col.push_back(jproj);
                     }
-                    ib+=nm;
                 }
             }
             const int trace_alpha_size = trace_alpha_row.size();
@@ -199,25 +231,46 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
                         &row_size);
 				}//ad2
                 // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                int ib=0, index=0, inc=1;
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                if(!if_equiv)
                 {
-                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                    int ib=0, index=0, inc=1;
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                     {
-                        const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2*L0+1;
-                
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            const int inl = this->inl_index[T0](I0, L0, N0);
+                            const int nm = 2*L0+1;
+                    
+                            for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                int ind = m1*nm + m2;
-                                pdm[inl][ind] += 
-                                    ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
-                                index++;
+                                for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                {
+                                    int ind = m1*nm + m2;
+                                    pdm[inl][ind] += 
+                                        ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                                    index++;
+                                }
                             }
+                            ib+=nm;
+                        }
+                    }
+                }
+                else
+                {
+                    int index = 0, inc=1;
+                    int nproj = 0;
+                    for(int il = 0; il < this->lmaxd + 1; il++)
+                    {
+                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                    }
+                    for(int iproj = 0; iproj < nproj; iproj ++)
+                    {
+                        for(int jproj = 0; jproj < nproj; jproj ++)
+                        {
+                            pdm[iat][iproj * nproj + jproj] += 
+                                ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                            index ++;
                         }
-                        ib+=nm;
                     }
                 }
 			}//ad1
@@ -234,11 +287,23 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
 void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
-    Grid_Driver& GridD,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d)
+    Grid_Driver& GridD)
 {
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+
+    int pdm_size;
+    if(!if_equiv)
+    {
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        int nproj = 0;
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = nproj * nproj;        
+    }
 
     if (GlobalV::init_chg == "file" && !this->init_pdm) //for DeePKS NSCF calculation 
     {
@@ -285,25 +350,44 @@ void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex
             //trace alpha orbital
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
-            int ib=0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+            if(!if_equiv)
             {
-                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                int ib=0;
+                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                 {
-                    const int inl = this->inl_index[T0](I0, L0, N0);
-                    const int nm = 2*L0+1;
-            
-                    for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                     {
-                        for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        const int inl = this->inl_index[T0](I0, L0, N0);
+                        const int nm = 2*L0+1;
+                
+                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            trace_alpha_row.push_back(ib+m1);
-                            trace_alpha_col.push_back(ib+m2);
+                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                trace_alpha_row.push_back(ib+m1);
+                                trace_alpha_col.push_back(ib+m2);
+                            }
                         }
+                        ib+=nm;
                     }
-                    ib+=nm;
                 }
             }
+            else
+            {
+                int nproj = 0;
+                for(int il = 0; il < this->lmaxd + 1; il++)
+                {
+                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                }
+                for(int iproj = 0; iproj < nproj; iproj ++)
+                {
+                    for(int jproj = 0; jproj < nproj; jproj ++)
+                    {
+                        trace_alpha_row.push_back(iproj);
+                        trace_alpha_col.push_back(jproj);
+                    }
+                }                
+            }
             const int trace_alpha_size = trace_alpha_row.size();
 
             for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
@@ -411,26 +495,47 @@ void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex
                         &row_size);
 				}//ad2
                 // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                int ib=0, index=0, inc=1;
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                if(!if_equiv)
                 {
-                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                    int ib=0, index=0, inc=1;
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                     {
-                        const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2*L0+1;
-                
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            const int inl = this->inl_index[T0](I0, L0, N0);
+                            const int nm = 2*L0+1;
+                    
+                            for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                int ind = m1*nm + m2;
-                                pdm[inl][ind] += 
-                                    ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
-                                index++;
+                                for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                {
+                                    int ind = m1*nm + m2;
+                                    pdm[inl][ind] += 
+                                        ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                                    index++;
+                                }
                             }
+                            ib+=nm;
                         }
-                        ib+=nm;
                     }
+                }
+                else
+                {
+                    int index = 0, inc=1;
+                    int nproj = 0;
+                    for(int il = 0; il < this->lmaxd + 1; il++)
+                    {
+                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                    }
+                    for(int iproj = 0; iproj < nproj; iproj ++)
+                    {
+                        for(int jproj = 0; jproj < nproj; jproj ++)
+                        {
+                            pdm[iat][iproj * nproj + jproj] += 
+                                ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                            index ++;
+                        }
+                    }                    
                 }
 			}//ad1
         }//I0
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index 57c7974e12..c7d81ac2d3 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -30,12 +30,39 @@
 #include "module_base/libm/libm.h"
 #include "module_base/blas_connector.h"
 
+void LCAO_Deepks::cal_descriptor_equiv(const int nat)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor_equiv");
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
+
+    // a rather unnecessary way of writing this, but I'll do it for now
+    if (!this->d_tensor.empty())
+    {
+        this->d_tensor.erase(this->d_tensor.begin(), this->d_tensor.end());
+    }
+
+    for(int iat = 0; iat < nat; iat++)
+    {
+        auto tmp = torch::zeros(des_per_atom,torch::kFloat64);
+        std::memcpy(tmp.data_ptr(),pdm[iat],sizeof(double)*tmp.numel());
+        this->d_tensor.push_back(tmp);
+    }
+
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
+}
+
 //calculates descriptors from projected density matrices
-void LCAO_Deepks::cal_descriptor(void)
+void LCAO_Deepks::cal_descriptor(const int nat)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor");
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");
 
+    if(if_equiv)
+    {
+        this->cal_descriptor_equiv(nat);
+        return;
+    }
+
     //init pdm_tensor and d_tensor
     torch::Tensor tmp;
 
@@ -85,24 +112,41 @@ void LCAO_Deepks::check_descriptor(const UnitCell &ucell)
     if(GlobalV::MY_RANK!=0) return;
     std::ofstream ofs("descriptor.dat");
     ofs<<std::setprecision(10);
-    for (int it = 0; it < ucell.ntype; it++)
+    if(!if_equiv)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        for (int it = 0; it < ucell.ntype; it++)
         {
-            int iat=ucell.itia2iat(it,ia);
-            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << std::endl;
-            int id = 0;
-            for(int inl=0;inl<inlmax/ucell.nat;inl++)
+            for (int ia = 0; ia < ucell.atoms[it].na; ia++)
             {
-                int nm = 2*inl_l[inl]+1;
-                for(int im=0;im<nm;im++)
+                int iat=ucell.itia2iat(it,ia);
+                ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << std::endl;
+                int id = 0;
+                for(int inl=0;inl<inlmax/ucell.nat;inl++)
                 {
-                    const int ind=iat*inlmax/ucell.nat+inl;
-                    ofs << d_tensor[ind].index({im}).item().toDouble() << " ";
-                    if (id % 8 == 7) ofs << std::endl;
-                    id++;
-                }
-            }   
+                    int nm = 2*inl_l[inl]+1;
+                    for(int im=0;im<nm;im++)
+                    {
+                        const int ind=iat*inlmax/ucell.nat+inl;
+                        ofs << d_tensor[ind].index({im}).item().toDouble() << " ";
+                        if (id % 8 == 7) ofs << std::endl;
+                        id++;
+                    }
+                }   
+                ofs << std::endl << std::endl;
+            }
+        }
+    }
+    else
+    {
+        for(int iat = 0; iat < ucell.nat; iat ++)
+        {
+            int it = ucell.iat2it[iat];
+            ofs << ucell.atoms[it].label << " atom_index " << iat + 1 << " n_descriptor " << this->des_per_atom << std::endl;
+            for(int i = 0; i < this->des_per_atom; i ++)
+            {
+                ofs << this->pdm[iat][i] << " ";
+                if (i % 8 == 7) ofs << std::endl;
+            }
             ofs << std::endl << std::endl;
         }
     }
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
index adb73fc568..f42ce77fc5 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
@@ -129,9 +129,7 @@ void test_deepks::check_pdm(void)
 		this->ld.cal_projected_DM_k(dm_k_new,
 			ucell,
 			ORB,
-            Test_Deepks::GridD,
-			kv.nkstot,
-			kv.kvec_d);		
+            Test_Deepks::GridD);		
 	}
 	this->ld.check_projected_dm();
 	this->compare_with_ref("pdm.dat","pdm_ref.dat");
@@ -187,7 +185,7 @@ void test_deepks::check_gdmx(void)
 
 void test_deepks::check_descriptor(void)
 {
-	this->ld.cal_descriptor();
+	this->ld.cal_descriptor(ucell.nat);
 	this->ld.check_descriptor(ucell);
 	this->compare_with_ref("descriptor.dat","descriptor_ref.dat");
 }

From c36e5170aa38537ea4b3d680f643502f0441df15 Mon Sep 17 00:00:00 2001
From: Wenfei Li <38569667+wenfei-li@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:14:01 +0800
Subject: [PATCH 10/13] Feature : reading and writing of h(r) and dm(r) in npz
 format (#4066)

* Feature : reading and writing of h(r) and dm(r) in npz format

* fix bug

* udpate makefile

* fix makefile and cmake

---------

Co-authored-by: wenfei-li <liwenfei@gmail.com>
---
 CMakeLists.txt                                |  22 +
 docs/advanced/input_files/input-main.md       |  14 +
 source/Makefile.Objects                       |   1 +
 source/module_base/global_variable.cpp        |   3 +
 source/module_base/global_variable.h          |   3 +
 source/module_elecstate/elecstate_lcao.cpp    |   7 +
 source/module_esolver/CMakeLists.txt          |   1 +
 source/module_esolver/esolver_ks.cpp          |  16 +
 source/module_esolver/esolver_ks_lcao.cpp     |  32 ++
 source/module_esolver/esolver_ks_lcao.h       |   3 +
 .../module_esolver/esolver_ks_lcao_elec.cpp   |  23 +
 source/module_esolver/io_npz.cpp              | 499 ++++++++++++++++++
 source/module_io/input.cpp                    |  32 +-
 source/module_io/input.h                      |   3 +
 source/module_io/input_conv.cpp               |   3 +
 source/module_io/parameter_pool.cpp           |  12 +
 source/module_io/test/input_conv_test.cpp     |   3 +
 source/module_io/test/write_input_test.cpp    |   3 +
 source/module_io/write_input.cpp              |   3 +
 19 files changed, 681 insertions(+), 2 deletions(-)
 create mode 100644 source/module_esolver/io_npz.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44fcd8f094..6a820b8b63 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(COMMIT_INFO "Print commit information in log" ON)
 option(ENABLE_FFT_TWO_CENTER "Enable FFT-based two-center integral method." ON)
 option(ENABLE_GOOGLEBENCH "Enable GOOGLE-benchmark usage." OFF)
 option(ENABLE_RAPIDJSON "Enable rapid-json usage." OFF)
+option(ENABLE_CNPY "Enable cnpy usage." OFF)
 
 # enable json support
 if(ENABLE_RAPIDJSON)
@@ -467,6 +468,27 @@ if(ENABLE_DEEPKS)
   add_compile_definitions(__DEEPKS)
 endif()
 
+if (ENABLE_CNPY)
+  find_path(cnpy_SOURCE_DIR
+    cnpy.h
+    HINTS ${libnpy_INCLUDE_DIR}
+  )
+  if(NOT cnpy_SOURCE_DIR)
+    include(FetchContent)
+    FetchContent_Declare(
+      cnpy
+      GIT_REPOSITORY https://github.com/rogersce/cnpy.git
+      GIT_PROGRESS TRUE
+    )
+    FetchContent_MakeAvailable(cnpy)
+  else()
+    include_directories(${cnpy_INCLUDE_DIR})
+  endif()
+  include_directories(${cnpy_SOURCE_DIR})
+  target_link_libraries(${ABACUS_BIN_NAME} cnpy)
+  add_compile_definitions(__USECNPY)
+endif()
+
 function(git_submodule_update)
   if(GIT_SUBMOD_RESULT EQUAL "0")
     message(DEBUG "Submodule init'ed")
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 16bebcab9d..f1345f04c4 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -1607,6 +1607,20 @@ These variables are used to control the output of properties.
 - **Description**: Whether to print the upper triangular part of the exchange-correlation matrices in **Kohn-Sham orbital representation** (unit: Ry): $\braket{\psi_i|V_\text{xc}^\text{(semi-)local}+V_\text{exx}+V_\text{DFTU}|\psi_j}$ for each k point into files in the directory `OUT.${suffix}`, which is useful for the subsequent GW calculation. (Note that currently DeePKS term is not included. ) The files are named `k-$k-Vxc`, the meaning of `$k`corresponding to k point and spin  is same as [hs_matrix.md](../elec_properties/hs_matrix.md#out_mat_hs).
 - **Default**: False
 
+### out_hr_npz/out_dm_npz
+
+- **Type**: Boolean
+- **Availability**: Numerical atomic orbital basis
+- **Description**: Whether to print Hamiltonian matrices H(R)/density matrics DM(R) in npz format. This feature does not work for gamma-only calculations. Currently only intended for internal usage.
+- **Default**: False
+
+### dm_to_rho
+
+- **Type**: Boolean
+- **Availability**: Numerical atomic orbital basis
+- **Description**: Reads density matrix DM(R) in npz format and creates electron density on grids. This feature does not work for gamma-only calculations. Only supports serial calculations. Currently only intended for internal usage.
+- **Default**: False
+
 ### out_app_flag
 
 - **Type**: Boolean
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 5416eb69e7..f61b7cced1 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -222,6 +222,7 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\
       esolver_ks_lcao_elec.o\
       esolver_ks_lcao_tddft.o\
       esolver_ks_lcao_tmpfunc.o\
+      io_npz.o\
 
 OBJS_GINT=gint.o\
       gint_gamma.o\
diff --git a/source/module_base/global_variable.cpp b/source/module_base/global_variable.cpp
index b9ceb358da..84ae03d359 100644
--- a/source/module_base/global_variable.cpp
+++ b/source/module_base/global_variable.cpp
@@ -283,6 +283,9 @@ bool out_bandgap = false; // QO added for bandgap printing
 int out_interval = 1;    // convert from out_hsR_interval liuyu 2023-04-18
 
 bool out_mat_xc = false; // output Vxc in KS-wfc representation for GW calculation
+bool out_hr_npz = false;
+bool out_dm_npz = false;
+bool dm_to_rho = false; // reads dm in npz format, then prints density in cube format
 
 //==========================================================
 // Deltaspin related
diff --git a/source/module_base/global_variable.h b/source/module_base/global_variable.h
index a504307ce2..bf181c84d9 100644
--- a/source/module_base/global_variable.h
+++ b/source/module_base/global_variable.h
@@ -313,6 +313,9 @@ extern bool out_bandgap;
 extern int out_interval;
 
 extern bool out_mat_xc; // output Vxc in KS-wfc representation for GW calculation
+extern bool out_hr_npz; //writes h(r) in npz format
+extern bool out_dm_npz; //writes dm(r) in npz format
+extern bool dm_to_rho; //reads in dm(r) and creates density
 
 // Deltaspin related
 extern bool sc_mag_switch; // 0: no deltaspin; 1: constrain atomic magnetic moments;
diff --git a/source/module_elecstate/elecstate_lcao.cpp b/source/module_elecstate/elecstate_lcao.cpp
index a70153fe68..4fdbe229fb 100644
--- a/source/module_elecstate/elecstate_lcao.cpp
+++ b/source/module_elecstate/elecstate_lcao.cpp
@@ -93,6 +93,12 @@ void ElecStateLCAO<std::complex<double>>::psiToRho(const psi::Psi<std::complex<d
     ModuleBase::timer::tick("ElecStateLCAO", "psiToRho");
 
     this->calculate_weights();
+
+// the calculations of dm, and dm -> rho are, technically, two separate functionalities, as we cannot
+// rule out the possibility that we may have a dm from other sources, such as read from file.
+// However, since we are not separating them now, I opt to add a flag to control how dm is obtained as of now
+if(!GlobalV::dm_to_rho)
+{
     this->calEBand();
 
     ModuleBase::GlobalFunc::NOTE("Calculate the density matrix.");
@@ -130,6 +136,7 @@ void ElecStateLCAO<std::complex<double>>::psiToRho(const psi::Psi<std::complex<d
             this->print_psi(psi);
         }
     }
+}
     // old 2D-to-Grid conversion has been replaced by new Gint Refactor 2023/09/25
     //this->loc->cal_dk_k(*this->lowf->gridt, this->wg, (*this->klist));
     for (int is = 0; is < GlobalV::NSPIN; is++)
diff --git a/source/module_esolver/CMakeLists.txt b/source/module_esolver/CMakeLists.txt
index 978160295e..44fe0eabe1 100644
--- a/source/module_esolver/CMakeLists.txt
+++ b/source/module_esolver/CMakeLists.txt
@@ -18,6 +18,7 @@ if(ENABLE_LCAO)
       esolver_ks_lcao_elec.cpp
       esolver_ks_lcao_tddft.cpp
       esolver_ks_lcao_tmpfunc.cpp
+      io_npz.cpp
   )
 endif()
 
diff --git a/source/module_esolver/esolver_ks.cpp b/source/module_esolver/esolver_ks.cpp
index 59db2849fd..e69e61c296 100644
--- a/source/module_esolver/esolver_ks.cpp
+++ b/source/module_esolver/esolver_ks.cpp
@@ -390,6 +390,7 @@ void ESolver_KS<T, Device>::run(const int istep, UnitCell& ucell)
 		ModuleBase::timer::tick(this->classname, "run");
 
 		this->before_scf(istep); //Something else to do before the iter loop
+		if(GlobalV::dm_to_rho) return; //nothing further is needed
 
 		ModuleBase::GlobalFunc::DONE(GlobalV::ofs_running, "INIT SCF");
 
@@ -631,6 +632,21 @@ ModuleIO::Output_Rho ESolver_KS<T, Device>::create_Output_Rho(
 {
 	const int precision = 3;
 	std::string tag = "CHG";
+	if(GlobalV::dm_to_rho)
+	{
+		return ModuleIO::Output_Rho(this->pw_big,
+								this->pw_rhod,
+								is,
+								GlobalV::NSPIN,
+								pelec->charge->rho[is],
+								iter,
+								this->pelec->eferm.get_efval(is),
+								&(GlobalC::ucell),
+								GlobalV::global_out_dir,
+								precision,
+								tag,
+								prefix);
+	}
 	return ModuleIO::Output_Rho(this->pw_big,
 			this->pw_rhod,
 			is,
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 949e6c4ba7..80c2d94c79 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -1116,6 +1116,38 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     }
 #endif
 
+    if(GlobalV::out_hr_npz)
+    {
+        this->p_hamilt->updateHk(0); // first k point, up spin
+        hamilt::HamiltLCAO<std::complex<double>, double>* p_ham_lcao 
+            = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt);
+        std::string zipname = "output_HR0.npz";
+        this->output_mat_npz(zipname,*(p_ham_lcao->getHR()));
+
+        if(GlobalV::NSPIN==2)
+        {
+            this->p_hamilt->updateHk(this->kv.nks/2); // the other half of k points, down spin
+            hamilt::HamiltLCAO<std::complex<double>, double>* p_ham_lcao 
+                = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt);
+            zipname = "output_HR1.npz";
+            this->output_mat_npz(zipname,*(p_ham_lcao->getHR()));            
+        }
+    }
+
+    if(GlobalV::out_dm_npz)
+    {
+        const elecstate::DensityMatrix<TK, double>* dm
+            = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
+        std::string zipname = "output_DM0.npz";
+        this->output_mat_npz(zipname,*(dm->get_DMR_pointer(1)));
+
+        if(GlobalV::NSPIN==2)
+        {
+            zipname = "output_DM1.npz";
+            this->output_mat_npz(zipname,*(dm->get_DMR_pointer(2)));       
+        }
+    }
+
     if (!md_skip_out(GlobalV::CALCULATION, istep, GlobalV::out_interval))
     {
         this->create_Output_Mat_Sparse(istep).write();
diff --git a/source/module_esolver/esolver_ks_lcao.h b/source/module_esolver/esolver_ks_lcao.h
index f579579cea..9886081e0f 100644
--- a/source/module_esolver/esolver_ks_lcao.h
+++ b/source/module_esolver/esolver_ks_lcao.h
@@ -115,6 +115,9 @@ namespace ModuleESolver
         /// @brief create ModuleIO::Output_Mat_Sparse object to output sparse density matrix of H, S, T, r
         ModuleIO::Output_Mat_Sparse<TK> create_Output_Mat_Sparse(int istep);
 
+        void read_mat_npz(std::string& zipname, hamilt::HContainer<double>& hR);
+        void output_mat_npz(std::string& zipname, const hamilt::HContainer<double>& hR);
+
         /// @brief check if skip the corresponding output in md calculation
         bool md_skip_out(std::string calculation, int istep, int interval);
 
diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index 35ee6df306..3646858a49 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -329,6 +329,29 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(int istep)
         ->get_DM()
         ->init_DMR(*(dynamic_cast<hamilt::HamiltLCAO<TK, TR>*>(this->p_hamilt)->getHR()));
 
+    if(GlobalV::dm_to_rho)
+    {
+        std::string zipname = "output_DM0.npz";
+        elecstate::DensityMatrix<TK, double>* dm
+            = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
+        this->read_mat_npz(zipname,*(dm->get_DMR_pointer(1)));
+        if(GlobalV::NSPIN == 2)
+        {
+            zipname = "output_DM1.npz";
+            this->read_mat_npz(zipname,*(dm->get_DMR_pointer(2)));
+        }
+
+        this->pelec->psiToRho(*this->psi);
+
+        this->create_Output_Rho(0, istep).write();
+        if(GlobalV::NSPIN == 2)
+        {
+            this->create_Output_Rho(1, istep).write();
+        }
+
+        return;
+    }
+
     // the electron charge density should be symmetrized,
     // here is the initialization
     Symmetry_rho srho;
diff --git a/source/module_esolver/io_npz.cpp b/source/module_esolver/io_npz.cpp
new file mode 100644
index 0000000000..3415ae3004
--- /dev/null
+++ b/source/module_esolver/io_npz.cpp
@@ -0,0 +1,499 @@
+//Deals with io of dm(r)/h(r) in npz format
+
+#include "module_esolver/esolver_ks_lcao.h"
+
+#include "module_base/parallel_reduce.h"
+#include "module_cell/module_neighbor/sltk_atom_arrange.h"
+#include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_hamilt_general/module_xc/xc_functional.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
+#ifdef __DEEPKS
+#include "module_hamilt_lcao/module_deepks/LCAO_deepks.h"	//caoyu add 2021-07-26
+#endif
+#include "module_base/timer.h"
+
+#ifdef __MPI
+#include <mpi.h>
+#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
+#endif
+
+#ifdef __USECNPY
+#include "cnpy.h"
+#endif
+
+#include "module_base/element_name.h"
+
+namespace ModuleESolver
+{
+
+template <typename TK, typename TR>
+void ESolver_KS_LCAO<TK, TR>::read_mat_npz(std::string& zipname, hamilt::HContainer<double>& hR)
+{
+    ModuleBase::TITLE("LCAO_Hamilt","read_mat_npz");
+
+    const Parallel_Orbitals* paraV = this->LOWF.ParaV;
+
+#ifdef __USECNPY
+
+#ifdef __MPI
+
+    if(GlobalV::NPROC!=1)
+    {
+        std::cout << "read_mat_npz is not supported in NPI mode yet" << std::endl;
+        return;
+    }
+
+/*
+    hamilt::HContainer<double>* HR_serial;
+    Parallel_Orbitals serialV;
+    serialV.set_proc_dim(1,0);
+    serialV.mpi_create_cart(MPI_COMM_WORLD);
+    serialV.set_local2global(GlobalV::NLOCAL, GlobalV::NLOCAL, GlobalV::ofs_running, GlobalV::ofs_warning);
+    serialV.set_global2local(GlobalV::NLOCAL, GlobalV::NLOCAL, false, GlobalV::ofs_running);
+    serialV.set_atomic_trace(GlobalC::ucell.get_iat2iwt(), GlobalC::ucell.nat, GlobalV::NLOCAL);
+*/
+    if(GlobalV::MY_RANK == 0)
+    {
+        //HR_serial = new hamilt::HContainer<double>(&serialV);
+
+        cnpy::npz_t my_npz = cnpy::npz_load(zipname);
+        std::vector<std::string> fnames;
+
+        //check consistency
+        // 1. lattice vectors
+        double* lattice_vector = my_npz["lattice_vectors"].data<double>();
+        assert(std::abs(lattice_vector[0] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.x) < 1e-6);
+        assert(std::abs(lattice_vector[1] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.y) < 1e-6);
+        assert(std::abs(lattice_vector[2] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.z) < 1e-6);
+        assert(std::abs(lattice_vector[3] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.x) < 1e-6);
+        assert(std::abs(lattice_vector[4] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.y) < 1e-6);
+        assert(std::abs(lattice_vector[5] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.z) < 1e-6);
+        assert(std::abs(lattice_vector[6] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.x) < 1e-6);
+        assert(std::abs(lattice_vector[7] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.y) < 1e-6);
+        assert(std::abs(lattice_vector[8] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.z) < 1e-6);
+
+        // 2. atoms
+        double* atom_info = my_npz["atom_info"].data<double>();
+        for(int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+        {
+            const int it = GlobalC::ucell.iat2it[iat];
+            const int ia = GlobalC::ucell.iat2ia[iat];
+
+            //get atomic number (copied from write_cube.cpp)
+            std::string element = "";
+            element = GlobalC::ucell.atoms[it].label;
+			std::string::iterator temp = element.begin();
+			while (temp != element.end())
+			{
+				if ((*temp >= '1') && (*temp <= '9'))
+				{
+					temp = element.erase(temp);
+				}
+				else
+				{
+					temp++;
+				}
+			}
+            int z = 0;
+            for(int j=0; j!=ModuleBase::element_name.size(); j++)
+            {
+                if (element == ModuleBase::element_name[j])
+                {
+                    z=j+1;
+                    break;
+                }
+            }
+
+            assert(atom_info[iat*5] == it);
+            assert(atom_info[iat*5+1] == z);
+            //I will not be checking the coordinates for now in case the direct coordinates provided in the 
+            //npz file do not fall in the range [0,1); if a protocol is to be set in the future such that
+            //this could be guaranteed, then the following lines could be uncommented
+            //assert(std::abs(atom_info[iat*5+2] - GlobalC::ucell.atoms[it].taud[ia].x) < 1e-6);
+            //assert(std::abs(atom_info[iat*5+3] - GlobalC::ucell.atoms[it].taud[ia].y) < 1e-6);
+            //assert(std::abs(atom_info[iat*5+4] - GlobalC::ucell.atoms[it].taud[ia].z) < 1e-6);            
+        }
+
+        // 3. orbitals
+        for(int it = 0; it < GlobalC::ucell.ntype; ++it)
+        {
+            std::string filename="orbital_info_"+std::to_string(it);
+            double* orbital_info = my_npz[filename].data<double>();
+            for(int iw = 0; iw < GlobalC::ucell.atoms[it].nw; ++iw)
+            {
+                assert(orbital_info[iw*3] == GlobalC::ucell.atoms[it].iw2n[iw]);
+                assert(orbital_info[iw*3+1] == GlobalC::ucell.atoms[it].iw2l[iw]);
+                const int im = GlobalC::ucell.atoms[it].iw2m[iw];
+                const int m = (im % 2 == 0) ? -im/2 : (im+1)/2; // copied from LCAO_gen_fixedH.cpp
+                assert(orbital_info[iw*3+2] == m);
+            }
+        }
+
+        //starts reading the matrix
+        for(auto const& imap: my_npz)
+            fnames.push_back(imap.first);
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+            
+            //hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,&serialV);
+            //HR_serial->insert_pair(tmp);
+            hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,paraV);
+            hR.insert_pair(tmp);
+
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                //hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,&serialV);
+                //HR_serial->insert_pair(tmp);
+                hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,paraV);
+                hR.insert_pair(tmp);
+            }
+        }
+
+        //HR_serial->allocate();
+        hR.allocate();
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+
+            double* submat_read = arr.data<double>();
+            
+            //hamilt::BaseMatrix<double>* submat = HR_serial->find_matrix(iat1,iat2,Rx,Ry,Rz);
+            hamilt::BaseMatrix<double>* submat = hR.find_matrix(iat1,iat2,Rx,Ry,Rz);
+
+            for(int i = 0; i < arr.shape[0]; i ++)
+            {
+                for(int j = 0; j < arr.shape[1]; j ++)
+                {
+                    submat->add_element(i,j,submat_read[i*arr.shape[1]+j]);
+                }
+            }
+            
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                //hamilt::BaseMatrix<double>* submat = HR_serial->find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                hamilt::BaseMatrix<double>* submat = hR.find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                for(int i = 0; i < arr.shape[0]; i ++)
+                {
+                    for(int j = 0; j < arr.shape[1]; j ++)
+                    {
+                        submat->add_element(j,i,submat_read[i*arr.shape[1]+j]);
+                    }
+                }
+            }
+        }
+
+    }
+#else
+
+        cnpy::npz_t my_npz = cnpy::npz_load(zipname);
+        std::vector<std::string> fnames;
+
+        for(auto const& imap: my_npz)
+            fnames.push_back(imap.first);
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+            
+            hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,paraV);
+            hR->insert_pair(tmp);
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,paraV);
+                hR->insert_pair(tmp);
+            }
+        }
+
+        hR->allocate();
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+
+            double* submat_read = arr.data<double>();
+            
+            hamilt::BaseMatrix<double>* submat = hR->find_matrix(iat1,iat2,Rx,Ry,Rz);
+
+            for(int i = 0; i < arr.shape[0]; i ++)
+            {
+                for(int j = 0; j < arr.shape[1]; j ++)
+                {
+                    submat->add_element(i,j,submat_read[i*arr.shape[1]+j]);
+                }
+            }
+            
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                hamilt::BaseMatrix<double>* submat = hR->find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                for(int i = 0; i < arr.shape[0]; i ++)
+                {
+                    for(int j = 0; j < arr.shape[1]; j ++)
+                    {
+                        submat->add_element(j,i,submat_read[i*arr.shape[1]+j]);
+                    }
+                }
+            }
+        }
+
+#endif
+
+#endif
+}
+
+template <typename TK, typename TR>
+void ESolver_KS_LCAO<TK, TR>::output_mat_npz(std::string& zipname, const hamilt::HContainer<double>& hR)
+{
+    ModuleBase::TITLE("LCAO_Hamilt","output_mat_npz");
+
+#ifdef __USECNPY
+    std::string filename = "";
+
+    if(GlobalV::MY_RANK == 0)
+    {
+        
+// first block: lattice vectors
+        filename = "lattice_vectors";
+        std::vector<double> lattice_vectors;
+        lattice_vectors.resize(9);
+        lattice_vectors[0] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.x;
+        lattice_vectors[1] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.y;
+        lattice_vectors[2] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.z;
+        lattice_vectors[3] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.x;
+        lattice_vectors[4] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.y;
+        lattice_vectors[5] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.z;
+        lattice_vectors[6] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.x;
+        lattice_vectors[7] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.y;
+        lattice_vectors[8] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.z;
+
+        cnpy::npz_save(zipname,filename,lattice_vectors);
+
+// second block: atom info
+        filename = "atom_info";
+        double* atom_info = new double[GlobalC::ucell.nat*5];
+        for(int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+        {
+            const int it = GlobalC::ucell.iat2it[iat];
+            const int ia = GlobalC::ucell.iat2ia[iat];
+
+            //get atomic number (copied from write_cube.cpp)
+            std::string element = "";
+            element = GlobalC::ucell.atoms[it].label;
+			std::string::iterator temp = element.begin();
+			while (temp != element.end())
+			{
+				if ((*temp >= '1') && (*temp <= '9'))
+				{
+					temp = element.erase(temp);
+				}
+				else
+				{
+					temp++;
+				}
+			}
+            int z = 0;
+            for(int j=0; j!=ModuleBase::element_name.size(); j++)
+            {
+                if (element == ModuleBase::element_name[j])
+                {
+                    z=j+1;
+                    break;
+                }
+            }
+
+            atom_info[iat*5] = it;
+            atom_info[iat*5+1] = z;
+            atom_info[iat*5+2] = GlobalC::ucell.atoms[it].taud[ia].x;
+            atom_info[iat*5+3] = GlobalC::ucell.atoms[it].taud[ia].y;
+            atom_info[iat*5+4] = GlobalC::ucell.atoms[it].taud[ia].z;
+        }
+        std::vector<size_t> shape={(size_t)GlobalC::ucell.nat,5};
+
+        cnpy::npz_save(zipname,filename,atom_info,shape,"a");
+        delete[] atom_info;
+
+//third block: orbital info
+        for(int it = 0; it < GlobalC::ucell.ntype; ++it)
+        {
+            filename="orbital_info_"+std::to_string(it);
+            double* orbital_info = new double[GlobalC::ucell.atoms[it].nw*3];
+            for(int iw = 0; iw < GlobalC::ucell.atoms[it].nw; ++iw)
+            {
+                orbital_info[iw*3] = GlobalC::ucell.atoms[it].iw2n[iw];
+                orbital_info[iw*3+1] = GlobalC::ucell.atoms[it].iw2l[iw];
+                const int im = GlobalC::ucell.atoms[it].iw2m[iw];
+                const int m = (im % 2 == 0) ? -im/2 : (im+1)/2; // copied from LCAO_gen_fixedH.cpp
+                orbital_info[iw*3+2] = m;
+            }
+            shape={(size_t)GlobalC::ucell.atoms[it].nw,3};
+
+            cnpy::npz_save(zipname,filename,orbital_info,shape,"a");
+        }
+    }
+
+//fourth block: hr(i0,jR)
+#ifdef __MPI
+    hamilt::HContainer<double>* HR_serial;
+    Parallel_Orbitals serialV;
+    serialV.set_global2local(GlobalV::NLOCAL, GlobalV::NLOCAL, false, GlobalV::ofs_running);
+    serialV.set_atomic_trace(GlobalC::ucell.get_iat2iwt(), GlobalC::ucell.nat, GlobalV::NLOCAL);
+    if(GlobalV::MY_RANK == 0)
+    {
+        HR_serial = new hamilt::HContainer<double>(&serialV);
+    }
+    hamilt::gatherParallels(hR, HR_serial, 0);
+
+    if(GlobalV::MY_RANK==0)
+    {
+        for(int iap=0;iap<HR_serial[0].size_atom_pairs();++iap)
+        {
+            int atom_i = HR_serial[0].get_atom_pair(iap).get_atom_i();
+            int atom_j = HR_serial[0].get_atom_pair(iap).get_atom_j();
+            if(atom_i > atom_j) continue;
+            int start_i = serialV.atom_begin_row[atom_i];
+            int start_j = serialV.atom_begin_col[atom_j];
+            int row_size = serialV.get_row_size(atom_i);
+            int col_size = serialV.get_col_size(atom_j);
+            for(int iR=0;iR<HR_serial[0].get_atom_pair(iap).get_R_size();++iR)
+            {
+                auto& matrix = HR_serial[0].get_atom_pair(iap).get_HR_values(iR);
+                int* r_index = HR_serial[0].get_atom_pair(iap).get_R_index(iR);
+                filename = "mat_"+std::to_string(atom_i)+"_"+std::to_string(atom_j)+"_"
+                    +std::to_string(r_index[0])+"_"+std::to_string(r_index[1])+"_"+std::to_string(r_index[2]);
+                std::vector<size_t> shape = {(size_t)row_size,(size_t)col_size};
+                cnpy::npz_save(zipname,filename,matrix.get_pointer(),shape,"a");
+            }
+        }
+
+    }
+#else
+    const Parallel_Orbitals* paraV = this->LM->ParaV;
+    auto row_indexes = paraV->get_indexes_row();
+    auto col_indexes = paraV->get_indexes_col();
+    for(int iap=0;iap<hR.size_atom_pairs();++iap)
+    {
+        int atom_i = hR.get_atom_pair(iap).get_atom_i();
+        int atom_j = hR.get_atom_pair(iap).get_atom_j();
+        int start_i = paraV->atom_begin_row[atom_i];
+        int start_j = paraV->atom_begin_col[atom_j];
+        int row_size = paraV->get_row_size(atom_i);
+        int col_size = paraV->get_col_size(atom_j);
+        for(int iR=0;iR<hR.get_atom_pair(iap).get_R_size();++iR)
+        {
+            auto& matrix = hR.get_atom_pair(iap).get_HR_values(iR);
+            int* r_index = hR.get_atom_pair(iap).get_R_index(iR);
+
+            filename = "mat_"+std::to_string(atom_i)+"_"+std::to_string(atom_j)+"_"
+                +std::to_string(r_index[0])+"_"+std::to_string(r_index[1])+"_"+std::to_string(r_index[2]);
+            std::vector<size_t> shape = {(size_t)row_size,(size_t)col_size};
+            cnpy::npz_save(zipname,filename,matrix.get_pointer(),shape,"a");
+        }
+    }
+#endif
+#endif
+}
+
+template class ESolver_KS_LCAO<double, double>;
+template class ESolver_KS_LCAO<std::complex<double>, double>;
+template class ESolver_KS_LCAO<std::complex<double>, std::complex<double>>;
+}
\ No newline at end of file
diff --git a/source/module_io/input.cpp b/source/module_io/input.cpp
index 60fc3ce47b..4fe9acac6d 100644
--- a/source/module_io/input.cpp
+++ b/source/module_io/input.cpp
@@ -346,6 +346,9 @@ void Input::Default(void)
     out_proj_band = 0;
     out_mat_hs = {0, 8};
     out_mat_xc = 0;
+    out_hr_npz = 0;
+    out_dm_npz = 0;
+    dm_to_rho = 0;
     cal_syns = 0;
     dmax = 0.01;
     out_mat_hs2 = 0; // LiuXh add 2019-07-15
@@ -1435,6 +1438,18 @@ bool Input::Read(const std::string& fn)
         {
             read_bool(ifs, out_mat_xc);
         }
+        else if (strcmp("out_hr_npz", word) == 0)
+        {
+            read_bool(ifs, out_hr_npz);
+        }
+        else if (strcmp("out_dm_npz", word) == 0)
+        {
+            read_bool(ifs, out_dm_npz);
+        }
+        else if (strcmp("dm_to_rho", word) == 0)
+        {
+            read_bool(ifs, dm_to_rho);
+        }
         else if (strcmp("out_interval", word) == 0)
         {
             read_value(ifs, out_interval);
@@ -2637,10 +2652,20 @@ bool Input::Read(const std::string& fn)
             gamma_only_local = 0;
         }
     }
-    if ((out_mat_r || out_mat_hs2 || out_mat_t || out_mat_dh) && gamma_only_local)
+    if ((out_mat_r || out_mat_hs2 || out_mat_t || out_mat_dh || out_hr_npz || out_dm_npz || dm_to_rho) && gamma_only_local)
     {
         ModuleBase::WARNING_QUIT("Input",
-                                 "printing of H(R)/S(R)/dH(R)/T(R) is not available for gamma only calculations");
+                                 "printing of H(R)/S(R)/dH(R)/T(R)/DM(R) is not available for gamma only calculations");
+    }
+    if(dm_to_rho && GlobalV::NPROC > 1)
+    {
+        ModuleBase::WARNING_QUIT("Input", "dm_to_rho is not available for parallel calculations");
+    }
+    if(out_hr_npz || out_dm_npz || dm_to_rho)
+    {
+#ifndef __USECNPY
+        ModuleBase::WARNING_QUIT("Input", "to write in npz format, please recompile with -DENABLE_CNPY=1");
+#endif
     }
     if (out_mat_dh && nspin == 4)
     {
@@ -3411,6 +3436,9 @@ void Input::Bcast()
     Parallel_Common::bcast_bool(out_mat_t);
     Parallel_Common::bcast_bool(out_mat_dh);
     Parallel_Common::bcast_bool(out_mat_xc);
+    Parallel_Common::bcast_bool(out_hr_npz);
+    Parallel_Common::bcast_bool(out_dm_npz);
+    Parallel_Common::bcast_bool(dm_to_rho);
     Parallel_Common::bcast_bool(out_mat_r); // jingan add 2019-8-14
     Parallel_Common::bcast_int(out_wfc_lcao);
     Parallel_Common::bcast_bool(out_alllog);
diff --git a/source/module_io/input.h b/source/module_io/input.h
index 857068d2da..a4f9f18895 100644
--- a/source/module_io/input.h
+++ b/source/module_io/input.h
@@ -271,6 +271,9 @@ class Input
     bool out_proj_band; // projected band structure calculation jiyy add 2022-05-11
     std::vector<int> out_mat_hs; // output H matrix and S matrix in local basis.
     bool out_mat_xc; // output exchange-correlation matrix in KS-orbital representation.
+    bool out_hr_npz;// output exchange-correlation matrix in KS-orbital representation.
+    bool out_dm_npz;
+    bool dm_to_rho;
     bool cal_syns; // calculate asynchronous S matrix to output
     double dmax; // maximum displacement of all atoms in one step (bohr)
     bool out_mat_hs2; // LiuXh add 2019-07-16, output H(R) matrix and S(R) matrix in local basis.
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index 229c517ef8..9a9dd1896b 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -684,6 +684,9 @@ void Input_Conv::Convert(void)
     hsolver::HSolverLCAO<std::complex<double>>::out_mat_t = INPUT.out_mat_t;
     hsolver::HSolverLCAO<std::complex<double>>::out_mat_dh = INPUT.out_mat_dh;
     GlobalV::out_mat_xc = INPUT.out_mat_xc;
+    GlobalV::out_hr_npz = INPUT.out_hr_npz;
+    GlobalV::out_dm_npz = INPUT.out_dm_npz;
+    GlobalV::dm_to_rho = INPUT.dm_to_rho;
     if (GlobalV::GAMMA_ONLY_LOCAL)
     {
         elecstate::ElecStateLCAO<double>::out_wfc_lcao = INPUT.out_wfc_lcao;
diff --git a/source/module_io/parameter_pool.cpp b/source/module_io/parameter_pool.cpp
index 9be1fe00ac..a3852ccd6a 100644
--- a/source/module_io/parameter_pool.cpp
+++ b/source/module_io/parameter_pool.cpp
@@ -926,6 +926,18 @@ void input_parameters_set(std::map<std::string, InputParameter> input_parameters
     {
         INPUT.out_mat_xc = *static_cast<bool*>(input_parameters["out_mat_xc"].get());
     }
+    else if (input_parameters.count("out_hr_npz") != 0)
+    {
+        INPUT.out_hr_npz = *static_cast<bool*>(input_parameters["out_hr_npz"].get());
+    }
+    else if (input_parameters.count("out_dm_npz") != 0)
+    {
+        INPUT.out_dm_npz = *static_cast<bool*>(input_parameters["out_dm_npz"].get());
+    }
+    else if (input_parameters.count("dm_to_rho") != 0)
+    {
+        INPUT.dm_to_rho = *static_cast<bool*>(input_parameters["dm_to_rho"].get());
+    }
     else if (input_parameters.count("cal_syns") != 0)
     {
         INPUT.cal_syns = *static_cast<bool*>(input_parameters["cal_syns"].get());
diff --git a/source/module_io/test/input_conv_test.cpp b/source/module_io/test/input_conv_test.cpp
index fac85bc51d..ea5d2f5266 100644
--- a/source/module_io/test/input_conv_test.cpp
+++ b/source/module_io/test/input_conv_test.cpp
@@ -154,6 +154,9 @@ TEST_F(InputConvTest, Conv)
     EXPECT_EQ(hsolver::HSolverLCAO<double>::out_mat_dh, INPUT.out_mat_dh);
     EXPECT_EQ(hsolver::HSolverLCAO<std::complex<double>>::out_mat_dh, INPUT.out_mat_dh);
     EXPECT_EQ(GlobalV::out_mat_xc, false);
+	EXPECT_EQ(GlobalV::out_hr_npz, false);
+	EXPECT_EQ(GlobalV::out_dm_npz, false);
+	EXPECT_EQ(GlobalV::dm_to_rho, false);
     EXPECT_EQ(GlobalV::out_interval, 1);
     EXPECT_EQ(elecstate::ElecStateLCAO<double>::out_wfc_lcao, false);
     EXPECT_EQ(berryphase::berry_phase_flag, false);
diff --git a/source/module_io/test/write_input_test.cpp b/source/module_io/test/write_input_test.cpp
index 8d3b8640df..8f9059a67e 100644
--- a/source/module_io/test/write_input_test.cpp
+++ b/source/module_io/test/write_input_test.cpp
@@ -324,6 +324,9 @@ TEST_F(write_input, LCAO5)
                 testing::HasSubstr("lcao_rmax                      30 #max R for 1D two-center integration table"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_hs                     0 #output H and S matrix"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_xc                     0 #output exchange-correlation matrix in KS-orbital representation"));
+    EXPECT_THAT(output, testing::HasSubstr("out_hr_npz                     0 #output hr(I0,JR) submatrices in npz format"));
+    EXPECT_THAT(output, testing::HasSubstr("out_dm_npz                     0 #output dmr(I0,JR) submatrices in npz format"));
+    EXPECT_THAT(output, testing::HasSubstr("dm_to_rho                      0 #reads dmr in npz format and calculates electron density"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_hs2                    0 #output H(R) and S(R) matrix"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_dh                     0 #output of derivative of H(R) matrix"));
     EXPECT_THAT(
diff --git a/source/module_io/write_input.cpp b/source/module_io/write_input.cpp
index cfb7174508..5035a618b5 100644
--- a/source/module_io/write_input.cpp
+++ b/source/module_io/write_input.cpp
@@ -229,6 +229,9 @@ ModuleBase::GlobalFunc::OUTP(ofs, "out_bandgap", out_bandgap, "if true, print ou
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_hs2", out_mat_hs2, "output H(R) and S(R) matrix");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_dh", out_mat_dh, "output of derivative of H(R) matrix");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_xc", out_mat_xc, "output exchange-correlation matrix in KS-orbital representation");
+    ModuleBase::GlobalFunc::OUTP(ofs, "out_hr_npz", out_hr_npz, "output hr(I0,JR) submatrices in npz format");
+    ModuleBase::GlobalFunc::OUTP(ofs, "out_dm_npz", out_dm_npz, "output dmr(I0,JR) submatrices in npz format");
+    ModuleBase::GlobalFunc::OUTP(ofs, "dm_to_rho", dm_to_rho, "reads dmr in npz format and calculates electron density");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_interval", out_interval, "interval for printing H(R) and S(R) matrix during MD");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_app_flag", out_app_flag, "whether output r(R), H(R), S(R), T(R), and dH(R) matrices in an append manner during MD");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_t", out_mat_t, "output T(R) matrix");

From 579048d8b0b553657502fa43c1d4f1aaf4e2e387 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Tue, 30 Apr 2024 15:06:55 +0800
Subject: [PATCH 11/13] pin MPI version (#4065)

---
 Dockerfile.intel | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.intel b/Dockerfile.intel
index 39ca4c1431..269713a16c 100644
--- a/Dockerfile.intel
+++ b/Dockerfile.intel
@@ -16,18 +16,16 @@ RUN apt-get update && \
         intel-oneapi-compiler-dpcpp-cpp \
         intel-oneapi-compiler-fortran \
         intel-oneapi-mkl-devel \
-        intel-oneapi-mpi-devel \
+        intel-oneapi-mpi-devel="2021.11.*" \
         intel-oneapi-vtune
-
-
-ENV I_MPI_ROOT='/opt/intel/oneapi/mpi/latest' \
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest \
     LIBRARY_PATH=/opt/intel/oneapi/tbb/latest/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/ippcp/latest/lib/:/opt/intel/oneapi/ipp/latest/lib:/opt/intel/oneapi/dpl/latest/lib:/opt/intel/oneapi/dnnl/latest/lib:/opt/intel/oneapi/dal/latest/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/ccl/latest/lib/ \
     LD_LIBRARY_PATH=/opt/intel/oneapi/tbb/latest/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/itac/latest/slib:/opt/intel/oneapi/ippcp/latest/lib/:/opt/intel/oneapi/ipp/latest/lib:/opt/intel/oneapi/dpl/latest/lib:/opt/intel/oneapi/dnnl/latest/lib:/opt/intel/oneapi/debugger/latest/opt/debugger/lib:/opt/intel/oneapi/dal/latest/lib:/opt/intel/oneapi/compiler/latest/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/ccl/latest/lib/ \
     PATH=/opt/intel/oneapi/vtune/latest/bin64:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/itac/latest/bin:/opt/intel/oneapi/inspector/latest/bin64:/opt/intel/oneapi/dpcpp-ct/latest/bin:/opt/intel/oneapi/dev-utilities/latest/bin:/opt/intel/oneapi/debugger/latest/opt/debugger/bin:/opt/intel/oneapi/compiler/latest/opt/oclfpga/bin:/opt/intel/oneapi/compiler/latest/bin:/opt/intel/oneapi/advisor/latest/bin64:/opt/mamba/bin:/opt/mamba/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
-    MKLROOT='/opt/intel/oneapi/mkl/latest' \
-    FI_PROVIDER_PATH='/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric' \
-    CMAKE_PREFIX_PATH='/opt/intel/oneapi/tbb/latest/env/..:/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/ipp/latest/lib/cmake/ipp:/opt/intel/oneapi/dpl/latest/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/latest/lib/cmake:/opt/intel/oneapi/dal/latest:/opt/intel/oneapi/compiler/latest' \
-    CMPLR_ROOT='/opt/intel/oneapi/compiler/latest'
+    MKLROOT=/opt/intel/oneapi/mkl/latest \
+    FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric \
+    CMAKE_PREFIX_PATH=/opt/intel/oneapi/tbb/latest/env/..:/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/ipp/latest/lib/cmake/ipp:/opt/intel/oneapi/dpl/latest/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/latest/lib/cmake:/opt/intel/oneapi/dal/latest:/opt/intel/oneapi/compiler/latest \
+    CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
 
 SHELL ["/bin/bash", "-c"]
 ENV CC=mpiicx CXX=mpiicpx FC=mpiifx

From a2a0e19082284742f54171cde77c2384ee689c63 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 30 Apr 2024 16:10:14 +0800
Subject: [PATCH 12/13] add some docs (#4073)

---
 docs/advanced/input_files/input-main.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index f1345f04c4..993e250478 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -438,7 +438,7 @@ These variables are used to control general system parameters.
 
 - **Type**: Integer
 - **Description**: takes value 1, 0 or -1.
-  - -1: No symmetry will be considered.
+  - -1: No symmetry will be considered. It is recommended to set -1 for non-colinear + soc calculations, where time reversal symmetry is broken sometimes.
   - 0: Only time reversal symmetry would be considered in symmetry operations, which implied k point and -k point would be treated as a single k point with twice the weight.
   - 1: Symmetry analysis will be performed to determine the type of Bravais lattice and associated symmetry operations. (point groups, space groups, primitive cells, and irreducible k-points)
 - **Default**: 

From 1439426b631f1dccfbecb1806915e18b2c566722 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:02:18 +0800
Subject: [PATCH 13/13] Fix: use std::abs() instead of abs() (#4076)

---
 source/module_elecstate/module_charge/charge_mixing.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/module_elecstate/module_charge/charge_mixing.cpp b/source/module_elecstate/module_charge/charge_mixing.cpp
index 57083301b2..e327904d0e 100755
--- a/source/module_elecstate/module_charge/charge_mixing.cpp
+++ b/source/module_elecstate/module_charge/charge_mixing.cpp
@@ -504,7 +504,7 @@ void Charge_Mixing::mix_rho_recip(Charge* chr)
         {
             chr->rhog[0][ig] = rhog_magabs[ig]; // rhog
             double norm = std::sqrt(chr->rho[1][ig] * chr->rho[1][ig] + chr->rho[2][ig] * chr->rho[2][ig] + chr->rho[3][ig] * chr->rho[3][ig]);
-            if (abs(norm) < 1e-10) continue;
+            if (std::abs(norm) < 1e-10) continue;
             double rescale_tmp = rho_magabs[npw + ig] / norm; 
             chr->rho[1][ig] *= rescale_tmp;
             chr->rho[2][ig] *= rescale_tmp;