diff --git a/source/module_lr/utils/ri_benchmark.cpp b/source/module_lr/utils/ri_benchmark.cpp
new file mode 100644
index 0000000000..722629d08e
--- /dev/null
+++ b/source/module_lr/utils/ri_benchmark.cpp
@@ -0,0 +1,239 @@
+#include "ri_benchmark.h"
+#include "module_base/module_container/base/third_party/blas.h"
+#include "module_base/blas_connector.h"
+namespace RI_Benchmark
+{
+    // std::cout << "the size of Cs:" << std::endl;
+    // for (auto& it1: Cs)
+    // {
+    //     for (auto& it2: it1.second)
+    //     {
+    //         std::cout << "iat1=" << it1.first << ", iat2= " << it2.first.first << std::endl;
+    //         auto& ts = it2.second.shape;
+    //         std::cout << "Tensor shape:" << ts[0] << " " << ts[1] << " " << ts[2] << std::endl; // abf, nw1, nw2
+    //     }
+    // }
+
+    template <typename TR>
+    inline int count_nao_from_Cs(const TLRI<TR>& Cs_ao)
+    {
+        int naos = 0;
+        for (const auto& it2 : Cs_ao.at(0))
+        {
+            assert(it2.second.shape.size() == 3);
+            naos += it2.second.shape[2];
+        }
+        return naos
+    }
+
+    /// @brief slice the psi from wfc_ks of all the k-points, some bands, some basis functions
+    /// @return
+    template <typename TK>
+    inline std::vector<TK> slice_psi(const psi::Psi<TK>& wfc_ks,
+        const bool& ibstart,
+        const int& nb,
+        const int& iwstart,
+        const int& nw)
+    {
+        std::vector<TK> psi_slice(wfc_ks.get_nk() * nb * nw);
+        int i = 0;
+        for (int ik = 0; ik < wfc_ks.get_nk(); ++ik)
+            for (int ib = ibstart; ib < nb; ++ib)
+                for (int iw = iwstart; iw < nw; ++iw)
+                    psi_slice[i++] = wfc_ks(ik, ib, iw);
+        return psi_slice;
+    }
+
+    template <typename TK, typename TR>
+    TLRI<TK> cal_Cs_mo(const UnitCell& ucell,
+        const TLRI<TR>& Cs_ao,
+        const psi::Psi<TK>& wfc_ks,
+        const int& nocc,
+        const int& nvirt,
+        const int& occ_first)
+    {
+        assert(wfc_ks.get_nk() == 1);   // currently only gamma-only is supported
+        assert(nocc + nvirt <= wfc_ks.get_nbands());
+        const int& naos = cs.get_nbasis();
+        assert(count_nao_from_Cs(Cs_ao) == naos); // wfc_ks must be global
+
+        TLRI<TK> Cs_mo;
+        int iw1 = 0, iw2 = 0;
+        for (auto& it1 : Cs_ao[i])
+        {
+            const int& iat1 = it1.first;
+            const int& it1 = ucell.iat2it[iat1];
+            const int& nw1 = ucell.atoms[it1].nw;
+            for (auto& it2 : it1.second)
+            {
+                const int& iat2 = it2.first.first;
+                const int& it2 = ucell.iat2it[iat2];
+                const int& nw2 = ucell.atoms[it2].nw;
+
+                const auto& tensor_ao = it2.second;
+                const int& nabf = tensor_ao.shape[0];
+                assert(tensor_ao.shape.size() == 3); // abf, nw1, nw2
+
+                std:: : vector<TK> psi_a1 = occ_first ? slice_psi(wfc_ks, 0, nocc, iw1, nw1)
+                    : slice_psi(wfc_ks, nocc, nvirt, iw1, nw1); // virt
+                std:: : vector<TK> psi_a2 = occ_first ? slice_psi(wfc_ks, nocc, nvirt, iw2, nw2)
+                    : slice_psi(wfc_ks, 0, nocc, iw2, nw2); // occ
+
+                Cs_mo[i][it.first][it2.first] = RI::Tensor<TK>({ nabf,nocc,nvirt });
+                for (int iabf = 0; iabf < nabf; ++iabf)
+                {
+                    const auto ptr = &tensor_ao(iabf, 0, 0);
+                    std::vector<TK> tmp(nw1 * (occ_first ? nvirt : nocc));
+                    // caution: Cs are row-major  (ia2 contiguous)
+                    if (occ_first)
+                    {
+                        gemm('T', 'N', nvirt, nw1, nw2, 1.0, psi_a2.data(), nw2, ptr, nw2, 0.0, tmp.data(), nvirt);
+                        gemm('N', 'N', nvirt, nocc, nw1, 1.0, tmp.data(), nvirt, psi_a1.data(), nw1, 0.0, Cs_mo[i][it.first][it2.first](iabf, 0, 0), nvirt);
+                    }
+                    else
+                    {
+                        gemm('T', 'N', nw1, nocc, nw2, 1.0, ptr, nw2, psi_a2.data(), nw2, 0.0, tmp.data(), nw1);
+                        gemm('T', 'N', nvirt, nocc, nw1, 1.0, psi_a1.data(), nw1, tmp.data(), nw1, 0.0, Cs_mo[i][it.first][it2.first](iabf, 0, 0), nvirt);
+                    }
+                }
+                iw1 += nw1;
+                iw2 += nw2;
+            }
+        }
+        return Cs_mo;
+    }
+
+    template <typename TK, typename TR>
+    std::vector<TK> cal_Amat_full(const TLRI<TK>& Cs_a,
+        const TLRI<TK>& Cs_b,
+        const TLRI<TR>& Vs)
+    {
+        assert(Cs_a.second.second.shape.size() == 3); // abf, nocc, nvirt
+        assert(Cs_b.second.second.shape.size() == 3); // abf, nocc, nvirt
+        assert(Vs.second.second.shape.size() == 2); // abf, abf
+
+        const int& npairs = Cs_a.second.second.shape[1] * Cs_a.second.second.shape[2];
+        std::vector<TK> Amat_full(npairs * npairs, 0.0);
+
+        for (auto& itv1 : Vs)
+        {
+            const int& iat1 = itv1.first;
+            for (auto& itv2 : itv1.second)
+            {
+                const int& iat2 = itv2.first.first;
+                const auto& tensor_v = itv2.second; // (nabf2, nabf1), T
+                for (auto& itca2 : Cs_a.at(iat1))
+                {
+                    const int& iat3 = itca2.first.first;
+                    const auto& tensor_ca = itca2.second; // (nvirt*nocc, nabf1), N
+                    for (auto& itcb2 : Cs_b.at(iat2))
+                    {
+                        const int& iat4 = itcb2.first.first;
+                        const auto& tensor_cb = itcb2.second; // (nvirt*nocc, nabf2), T
+                        const int& nabf1 = tensor_v.shape[0];
+                        const int& nabf2 = tensor_v.shape[1];
+                        assert(tensor_ca.shape[0] == nabf1);   //abf1
+                        assert(tensor_cb.shape[0] == nabf2); //abf2
+                        std::vector<TK> tmp(npairs * nabf1);
+                        gemm('T', 'T', nabf1, npairs, nabf2, 1.0, tensor_v.ptr(), nabf2, tensor_cb.ptr(), npairs, 0.0, tmp.data(), nabf1);
+                        gemm('N', 'N', npairs, npairs, nabf1, 1.0, tensor_ca.ptr(), npairs, tmp.data(), nabf1, 1.0, Amat_full.data(), npairs);
+                    }
+                }
+            }
+        }
+        return Amat_full;
+    }
+
+    template <typename TK, typename TR>
+    TLRIX<TK> cal_CsX(const UnitCell& ucell,
+        const TLRI<TR>& Cs_ao,
+        const psi::Psi<TK>& wfc_ks,
+        const psi::Psi<TK>& X_istate,
+        const int& nocc)
+    {
+        assert(X_istate.get_nk() == 1);   // currently only gamma-only is supported
+        const int& npairs = X_istate.get_nbasis();
+        assert(npairs % nocc == 0);
+        const int& nvirt = npairs / nocc;
+        const TLRI<TK>& Cs_mo = cal_Cs_mo(ucell, Cs_ao, wfc_ks, nocc, nvirt,/*jb: occ_first=*/true);
+
+        TLRIX<TK> CsX;
+        for (auto& it1 : Cs_mo)
+        {
+            const int& iat1 = it1.first;
+            for (auto& it2 : it1.second)
+            {
+                const int& iat2 = it2.first.first;
+                auto& tensor_c = it2.second;
+                const int& nabf = tensor_c.shape[0];
+
+                std::vector<TK> CX(nabf);
+                for (int iabf = 0;iabf < nabf;++iabf)
+                    CX[iabf] = dot(X.get_nbasis(), &tensor_c(iabf, 0, 0), 1, X_istate.get_pointer(), 1);
+                CsX[iat1][it2.first] = CX;
+            }
+        }
+        return CsX;
+    }
+
+    template <typename TK, typename TR>
+    std::vector<TK> cal_AX(const TLRI<TK>& Cs_a,
+        const TLRIX<TK>& Cs_bX,
+        const TLRI<TR>& Vs)
+    {
+        assert(Cs_a.second.second.shape.size() == 3); // abf, nocc, nvirt
+        assert(Vs.second.second.shape.size() == 2); // abf, abf
+
+        const int& npairs = Cs_a.second.second.shape[1] * Cs_a.second.second.shape[2];
+        std::vector<TK> AX(npairs, 0.0);
+
+        for (auto& itv1 : Vs)
+        {
+            const int& iat1 = itv1.first;
+            for (auto& itv2 : itv1.second)
+            {
+                const int& iat2 = itv2.first.first;
+                const auto& tensor_v = itv2.second; // (nabf2, nabf1), T
+                for (auto& itca2 : Cs_a.at(iat1))
+                {
+                    const int& iat3 = itca2.first.first;
+                    const auto& tensor_ca = itca2.second; // (nvirt*nocc, nabf1), N
+                    for (auto& itcb2 : Cs_bX.at(iat2))
+                    {
+                        const int& iat4 = itcb2.first.first;
+                        const auto& vector_cb = itcb2.second; // (nvirt*nocc, nabf2), T
+                        const int& nabf1 = tensor_v.shape[0];
+                        const int& nabf2 = tensor_v.shape[1];
+                        assert(tensor_ca.shape[0] == nabf1);   //abf1
+                        assert(vector_cb.size() == nabf2); //abf2
+                        std::vector<TK> tmp(nabf1);
+                        gemv('T', nabf1, nabf2, 1.0, tensor_v.ptr(), nabf2, vector_cb.data(), 1, 0.0, tmp.data(), 1);
+                        gemv('N', npairs, nabf1, 1.0, tensor_ca.ptr(), npairs, tmp.data(), 1, 1.0, AX.data(), 1);
+                    }
+                }
+            }
+        }
+        return Amat_full;
+    }
+
+    template <typename TK, typename TR>
+    void benchmark_driver_A<TK, TR>(std::string& file_Cs, std::string& file_Vs, std::string& file_kswfc,
+        const int& nocc, const int& nvirt)
+    {
+        const int nks = 1;
+        const int& npairs = nocc * nvirt;
+        const TLRI<TR>& Cs_ao = read_Cs_ao<TR>(file_Cs);
+        const TLRI<TR>& Vs = read_Vs_abf<TR>(file_Vs);
+        const int nbands = LR_Util::
+            const psi::Psi<TK>&wfc_ks = read_wfc_ks<TK>(file_kswfc);
+
+        const TLRI<TK>& Cs_mo_ai = cal_Cs_mo(Cs_ao, wfc_ks, nocc, nvirt, /*occ_first=*/false);
+        const TLRI<TK>& Cs_mo_jb = cal_Cs_mo(Cs_ao, wfc_ks, nocc, nvirt, /*occ_first=*/true);
+        const std::vector<TK>& Amat_full = cal_Amat_full(Cs_mo_ai, Cs_mo_jb, Vs);
+        std::vector<Real> eigenvalue(npairs);
+        LR_Util::diag_lapack(nks * npairs, Amat_full.data(), eigenvalue.data());
+        std::cout << "eigenvalues:" << std::endl;
+        for (int i = 0; i < npairs; ++i) { std::cout << eigenvalue[i] << "  " << std::endl; }
+
+    }
+}
diff --git a/source/module_lr/utils/ri_benchmark.h b/source/module_lr/utils/ri_benchmark.h
new file mode 100644
index 0000000000..c34ba83d18
--- /dev/null
+++ b/source/module_lr/utils/ri_benchmark.h
@@ -0,0 +1,63 @@
+#pragma once
+#include "module_cell/unitcell.h"
+#include "module_psi/psi.h"
+
+#include <RI/global/Tensor.h>
+namespace RI_Benchmark
+{
+    using TC = std::array<int, 3>;
+    using TAC = std::pair<int, TC>;
+
+    template <typename T>
+    using TLRI = std::map<int, std::map<TAC, RI::Tensor<T>>>;
+    template<typename T>
+    using TLRIX = std::map<int, std::map<TAC, std::vector<T>>>;
+
+    template <typename TK, typename TR>
+    void benchmark_driver_A(std::string& file_Cs, std::string& file_Vs, std::string& file_kswfc, const int nocc, const int nvirt);
+    template <typename TK, typename TR>
+    void benchmark_driver_AX(std::string& file_Cs, std::string& file_Vs, std::string& file_kswfc, const int nocc, const int nvirt);
+
+    // 1. full-matrix version
+
+    /// calculate $Cs_{Ua\alpha,Vi}^{mo} = \sum_{\mu\nu} C_{U\mu\alpha,V\nu}^{ao} c^*_{\mu a} c_{\nu i}$
+    /// if occ_first, calculate $Cs_{Ui\alpha,Va}^{mo} = \sum_{\mu\nu} C_{U\mu\alpha,V\nu}^{ao} c^*_{\mu i} c_{\nu a}$
+    template <typename TK, typename TR>
+    TLRI<TK> cal_Cs_mo(const UnitCell& ucell,
+        const TLRI<TR>& Cs_ao,
+        const psi::Psi<TK>& wfc_ks,
+        const int& nocc,
+        const int& nvirt,
+        const int& occ_first = false);
+
+    /// A=CVC, sum over atom quads
+    template <typename TK, typename TR>
+    std::vector<TK> cal_Amat_full(const TLRI<TK>& Cs_a,
+        const TLRI<TK>& Cs_b,
+        const TLRI<TR>& Vs);
+
+    // 2. AX version
+    template <typename TK, typename TR>
+    TLRIX<TK> cal_CsX(const UnitCell& ucell,
+        const TLRI<TR>& Cs_ao,
+        const psi::Psi<TK>& wfc_ks,
+        const psi::Psi<TK>& X,
+        const int& nocc);
+
+    /// AX=CVCX, sum over atom quads
+    template <typename TK, typename TR>
+    std::vector<TK> cal_AX(const TLRI<TK>& Cs_a,
+        const TLRIX<TK>& Cs_bX,
+        const TLRI<TR>& Vs);
+
+    // read/write
+    template <typename TR>
+    TLRI<TR> read_Cs_ao(const std::string& file_path);
+    template <typename TR>
+    TLRI<TR> read_Vs_abf(const std::string& file_path);
+    template <typename TR>
+    void write_Vs_abf(const TLRI<TR>& Vs, const std::string& file_path);
+    // template <typename TK>
+    // psi::Psi<TK> read_wfc_ks(const std::string& file_path);
+
+} // namespace LR_Util
\ No newline at end of file
diff --git a/source/module_lr/utils/ri_benchmark_driver.cpp b/source/module_lr/utils/ri_benchmark_driver.cpp
new file mode 100644
index 0000000000..2da4216fe5
--- /dev/null
+++ b/source/module_lr/utils/ri_benchmark_driver.cpp
@@ -0,0 +1,27 @@
+#include "ri_benchmark.h"
+#include "lr_util.h"
+namespace RI_Benchmark
+{
+    template <typename TK, typename TR>
+    void benchmark_driver_A<TK, TR>(std::string& file_Cs, std::string& file_Vs, std::string& file_kswfc,
+        const int homo, const int nocc, const int nvirt)
+    {
+        assert(nocc > 0 && nocc < homo);
+        const int nks = 1;
+        const int& npairs = nocc * nvirt;
+        const TLRI<TR>& Cs_ao = read_Cs_ao<TR>(file_Cs);
+        const TLRI<TR>& Vs = read_Vs_abf<TR>(file_Vs);
+
+        // read range: [homo-nocc, homo+nvirt]
+        const psi::Psi<TK>& wfc_ks = read_wfc_ks<TK>(file_kswfc);
+
+        const TLRI<TK>& Cs_mo_ai = cal_Cs_mo(Cs_ao, wfc_ks, nocc, nvirt, /*occ_first=*/false);
+        const TLRI<TK>& Cs_mo_jb = cal_Cs_mo(Cs_ao, wfc_ks, nocc, nvirt, /*occ_first=*/true);
+        const std::vector<TK>& Amat_full = cal_Amat_full(Cs_mo_ai, Cs_mo_jb, Vs);
+        std::vector<Real> eigenvalue(npairs);
+        LR_Util::diag_lapack(nks * npairs, Amat_full.data(), eigenvalue.data());
+        std::cout << "eigenvalues:" << std::endl;
+        for (int i = 0; i < npairs; ++i) { std::cout << eigenvalue[i] << "  " << std::endl; }
+
+    }
+}
\ No newline at end of file
diff --git a/source/module_lr/utils/ri_benchmark_io.cpp b/source/module_lr/utils/ri_benchmark_io.cpp
new file mode 100644
index 0000000000..caeb2673bb
--- /dev/null
+++ b/source/module_lr/utils/ri_benchmark_io.cpp
@@ -0,0 +1,82 @@
+#include "ri_benchmark.h"
+#include "module_base/abfs-vector3_order.h"
+
+namespace RI_Benchmark
+{
+    template<typename TR>
+    TLRI<TR> read_Cs_ao(const std::string& file_path)
+    {
+        const int natom, ncell, ia1, ia2, ic_1, ic_2, ic_3, nw1, nw2, nabf;
+        std::ifstream infile;
+        infile.open(file_path);
+        infile >> natom >> ncell;   // no use of ncell
+
+        TLRI<TR> Cs;
+        while (infile.peek() != EOF)
+        {
+            infile >> ia1 >> ia2 >> ic_1 >> ic_2 >> ic_3 >> nw1 >> nw2 >> nabf;
+            ModuleBase::Vector3_Order<int> box(ic_1, ic_2, ic_3);
+            RI::Tensor<TR> tensor_cs({ nabf, na1, na2 });
+            for (int i = 0; i != nw1; i++)
+                for (int j = 0; j != nw2; j++)
+                    for (int mu = 0; mu != nabf; mu++)
+                        infile >> tensor_cs(mu, i, j);
+            // no screening for data-structure consistency
+            // if (loc_atp_index.count(ia1) && (*cs_ptr).absmax() >= threshold)
+            Cs[ia1][{ia2, box}] = tensor_cs;
+            // else ++cs_discard;
+        }
+        return Cs;
+    }
+
+    template<typename TR>
+    TLRI<TR> read_Vs_abf(const std::string& file_path)
+    {
+        const int natom, ncell, ia1, ia2, ic_1, ic_2, ic_3, nabf1, nabf2;
+        std::ifstream infile;
+        infile.open(file_path);
+        infile >> natom >> ncell;   // no use of ncell
+
+        TLRI<TR> Vs;
+        while (infile.peek() != EOF)
+        {
+            infile >> ia1 >> ia2 >> ic_1 >> ic_2 >> ic_3 >> nabf1 >> nabf2;
+            ModuleBase::Vector3_Order<int> box(ic_1, ic_2, ic_3);
+            RI::Tensor<TR> tensor_cs({ n_mu,n_nu });
+            for (int i = 0; i != nabf1; i++)
+                for (int j = 0; j != nabf2; j++)
+                    infile >> tensor_cs(i, j);
+            // no screening for data-structure consistency
+            // if (loc_atp_index.count(ia1) && (*cs_ptr).absmax() >= threshold)
+            Vs[ia1][{ia2, box}] = tensor_cs;
+            // else ++cs_discard;
+        }
+        return Vs;
+    }
+
+    template <typename TR>
+    void write_Vs_abf(const TLRI<TR>& Vs, const std::string& file_path)
+    {
+        std::ofstream outfile;
+        outfile.open(file_path);
+        outfile << Vs.size() << " " << Vs.at(0).size() / Vs.size() << std::endl;    //natom, ncell
+        for (const auto& it1 : Vs)
+        {
+            const int& ia1 = it1.first;
+            for (const auto& it2 : it1.second)
+            {
+                const int& ia2 = it2.first.first;
+                const auto& box = it2.first.second;
+                const auto& tensor_v = it2.second;
+                outfile << ia1 << " " << ia2 << " " << box[0] << " " << box[1] << " " << box[2] << std::endl;
+                outfile << << tensor_v.shape[0] << " " << tensor_v.shape[1] << std::endl;
+                for (int i = 0; i != tensor_v.shape[0]; i++)
+                {
+                    for (int j = 0; j != tensor_v.shape[1]; j++)
+                        outfile << tensor_v(i, j) << " ";
+                    outfile << std::endl;
+                }
+            }
+        }
+    }
+}
\ No newline at end of file