New Rys kernel (#221)

* New Rys kernel Improve tasks generation Tune Fix cupy_helper Optimized cache Rpa Adapt to scf code q_cond double to float do_j and do_k cleanup and bugfix Add LR ERI Add SR ERI Tune sr-eri Rys ERI based j-engine Tune J-engine Task adjustment Fix condense function Add tests and examples for JK matrix evaluation * Solve merging conflicts * Missing import * Fix import errors * Conditionally compiling against CUDA_VERSION * Restore CMakeLists.txt * Missing code * test name conflicts * Fix various merging conflicts * Update unittest configuration * compilation for cuda<12.4 * Update the treatment of omega in get_jk * Handle h functions * Fix a gradients initialization bug * Add assertion and comments * Compiling issue for rys-j * some code in legacy version --------- Co-authored-by: Qiming Sun <[email protected]> Co-authored-by: xiaojie.wu <[email protected]>
pyscf · Oct 11, 2024 · ed993be · ed993be
1 parent a98497f
commit ed993be
Show file tree

Hide file tree

Showing 50 changed files with 83,896 additions and 847 deletions.
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
@@ -33,8 +33,7 @@ jobs:
     - name: Build GPU4PySCF
       run: |
         export CUDA_HOME=/usr/local/cuda
-        export PATH=${CUDA_HOME}/bin:${PATH}
-        export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
+        export CMAKE_CONFIGURE_ARGS="-DBUILD_LIBXC=OFF -DCUDA_ARCHITECTURES=70-real -DBUILD_CUTLASS=ON"
         sh build.sh
     - name: Test with pytest
       run: |

diff --git a/build.sh b/build.sh
@@ -1,8 +1,10 @@
 #!/bin/bash
 
+export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
 echo "PATH=${PATH}"
 echo "CUDA_HOME=${CUDA_HOME}"
-export PATH="$CUDA_HOME/bin:$PATH"
+export PATH="${CUDA_HOME}/bin:$PATH"
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH
 python3 setup.py bdist_wheel
 rm -rf output && mv dist output
 CURRENT_PATH=`pwd`

diff --git a/examples/23-j_engine.py b/examples/23-j_engine.py
@@ -0,0 +1,41 @@
+# Copyright 2024 The GPU4PySCF Authors. All Rights Reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+'''
+Compute J and K matrices separately. The J matrix is evaluated using J-engine.
+'''
+
+import pyscf
+from gpu4pyscf import scf
+from gpu4pyscf.scf import jk
+
+mol = pyscf.M(
+atom = '''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+''',
+basis='def2-tzvp',
+verbose=5
+)
+
+def get_veff(self, mol, dm, *args, **kwargs):
+    vj = jk.get_j(mol, dm[0] + dm[1])
+    _, vk = jk.get_jk(mol, dm, with_j=False)
+    return vj - vk
+
+scf.uhf.UHF.get_veff = get_veff
+
+mf = mol.UHF().to_gpu().run()
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
@@ -1,4 +1,4 @@
-from . import lib, grad, hessian, solvent, scf, dft
+#from . import lib, grad, hessian, solvent, scf, dft
 
 __version__ = '1.0.2'
 

diff --git a/gpu4pyscf/cc/ccsd_incore.py b/gpu4pyscf/cc/ccsd_incore.py
@@ -31,7 +31,7 @@
 from pyscf.cc import ccsd
 from pyscf.cc import _ccsd
 from pyscf import __config__
-from gpu4pyscf.scf import hf as gpu_hf
+from gpu4pyscf.scf import int4c2e
 from gpu4pyscf.lib.cupy_helper import load_library
 from gpu4pyscf.lib import logger
 
@@ -171,7 +171,7 @@ def _direct_ovvv_vvvv(mycc, t1, t2):
                                   ((mem_avail-Ht2_mem)*.5/8/nao_cart**2)**.5)))
     logger.debug1(mycc, 'blksize %d nao %d', blksize, nao_cart)
 
-    vhfopt = gpu_hf._VHFOpt(mycc.mol, 'int2e')
+    vhfopt = int4c2e._VHFOpt(mycc.mol, 'int2e')
     vhfopt.build(group_size=blksize, diag_block_with_triu=True)
     mol = vhfopt.mol
 
@@ -219,7 +219,7 @@ def contract_vvvv_(eri, i0, i1, j0, j1):
     log_qs = vhfopt.log_qs
     cp_idx, cp_jdx = np.tril_indices(len(vhfopt.uniq_l_ctr))
 
-    if vhfopt.uniq_l_ctr[:,0].max() <= gpu_hf.LMAX_ON_GPU:
+    if vhfopt.uniq_l_ctr[:,0].max() <= int4c2e.LMAX_ON_GPU:
         # Computing ERIs on GPU
         idx, idy = cupy.tril_indices(nao)
         #eribuf = cupy.empty(blksize**2*nao**2)
@@ -279,7 +279,7 @@ def fint(ish0, ish1, jsh0, jsh1, group_id):
         cpj = cp_jdx[cp_ij_id]
         li = vhfopt.uniq_l_ctr[cpi,0]
         lj = vhfopt.uniq_l_ctr[cpj,0]
-        if li > gpu_hf.LMAX_ON_GPU or lj > gpu_hf.LMAX_ON_GPU or log_q_ij.size == 0:
+        if li > int4c2e.LMAX_ON_GPU or lj > int4c2e.LMAX_ON_GPU or log_q_ij.size == 0:
             continue
 
         ish0 = l_ctr_offsets[cpi]
@@ -375,7 +375,7 @@ def _fill_eri_block(eri, strides, ao_offsets, vhfopt, group_id):
     cpl = cp_jdx[cp_kl_id]
     lk = vhfopt.uniq_l_ctr[cpk,0]
     ll = vhfopt.uniq_l_ctr[cpl,0]
-    if lk > gpu_hf.LMAX_ON_GPU or ll > gpu_hf.LMAX_ON_GPU:
+    if lk > int4c2e.LMAX_ON_GPU or ll > int4c2e.LMAX_ON_GPU:
         raise NotImplementedError
 
     stream = cupy.cuda.get_current_stream()
@@ -394,7 +394,7 @@ def _fill_eri_block(eri, strides, ao_offsets, vhfopt, group_id):
         cpj = cp_jdx[cp_ij_id]
         li = vhfopt.uniq_l_ctr[cpi,0]
         lj = vhfopt.uniq_l_ctr[cpj,0]
-        if li > gpu_hf.LMAX_ON_GPU or lj > gpu_hf.LMAX_ON_GPU or log_q_ij.size == 0:
+        if li > int4c2e.LMAX_ON_GPU or lj > int4c2e.LMAX_ON_GPU or log_q_ij.size == 0:
             continue
 
         t0 = time.perf_counter()
@@ -446,7 +446,7 @@ def _make_eris_incore(mycc, mo_coeff=None):
                                   (mem_avail*.5/8/nao_cart**2)**.5)))
     logger.debug1(mycc, 'blksize %d nao %d', blksize, nao_cart)
 
-    vhfopt = gpu_hf._VHFOpt(mycc.mol, 'int2e')
+    vhfopt = int4c2e._VHFOpt(mycc.mol, 'int2e')
     vhfopt.build(group_size=blksize, diag_block_with_triu=True)
     mol = vhfopt.mol
     mo = vhfopt.coeff.dot(mo_coeff)
@@ -470,7 +470,7 @@ def _make_eris_incore(mycc, mo_coeff=None):
         cpj = cp_jdx[cp_ij_id]
         li = vhfopt.uniq_l_ctr[cpi,0]
         lj = vhfopt.uniq_l_ctr[cpj,0]
-        if li > gpu_hf.LMAX_ON_GPU or lj > gpu_hf.LMAX_ON_GPU or log_q_ij.size == 0:
+        if li > int4c2e.LMAX_ON_GPU or lj > int4c2e.LMAX_ON_GPU or log_q_ij.size == 0:
             continue
 
         ish0 = l_ctr_offsets[cpi]

diff --git a/gpu4pyscf/cc/tests/test_ccsd.py b/gpu4pyscf/cc/tests/test_ccsd.py
@@ -65,4 +65,12 @@ def test_to_gpu(self):
 
 if __name__ == '__main__':
     print("Full Tests for CCSD")
-    unittest.main()
+    #unittest.main()
+    mol = pyscf.M(atom = [
+        [8 , (0. , 0.     , 0.)],
+        [1 , (0. , -0.757 , 0.587)],
+        [1 , (0. , 0.757  , 0.587)],
+    ], basis = 'cc-pvdz', verbose=0)
+    mf = mol.RHF().run()
+    mcc = ccsd_incore.CCSD(mf)
+    eris = mcc.ao2mo()
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
@@ -20,9 +20,11 @@
 import cupy
 from pyscf import gto, df, lib
 from pyscf.scf import _vhf
-from gpu4pyscf.scf.hf import BasisProdCache, _make_s_index_offsets
+from gpu4pyscf.scf.int4c2e import (BasisProdCache, _make_s_index_offsets,
+                                   libgvhf, libgint)
 from gpu4pyscf.lib.cupy_helper import (
-    block_c2s_diag, cart2sph, block_diag, contract, load_library, get_avail_mem, print_mem_info, take_last2d)
+    block_c2s_diag, cart2sph, block_diag, contract, load_library, get_avail_mem,
+    print_mem_info, take_last2d, libcupy_helper)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
 
@@ -32,10 +34,6 @@
 BLKSIZE = 128
 NROOT_ON_GPU = 7
 
-libgvhf = load_library('libgvhf')
-libgint = load_library('libgint')
-libcupy_helper = load_library('libcupy_helper')
-
 def make_fake_mol():
     '''
     fake mol for pairing with auxiliary basis

diff --git a/gpu4pyscf/dft/__init__.py b/gpu4pyscf/dft/__init__.py
@@ -10,4 +10,3 @@ def KS(mol, xc='LDA,VWN'):
         return RKS(mol, xc)
     else:
         return UKS(mol, xc)
-
diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py
@@ -40,12 +40,10 @@
     libxc_path = os.path.abspath(os.path.join(path, 'gpu4pyscf', 'lib', 'deps', 'lib'))
     try:
         _libxc = np.ctypeslib.load_library('libxc', libxc_path)
+        break
     except Exception:
         _libxc = None
 
-    if _libxc is not None:
-        break
-
 libgdft = load_library('libgdft')
 libgdft.GDFT_xc_lda.argtypes = (
     ctypes.c_void_p,

diff --git a/gpu4pyscf/dft/xc_deriv.py b/gpu4pyscf/dft/xc_deriv.py
@@ -241,4 +241,4 @@ def transform_kxc(rho, fxc, kxc, xctype, spin=0):
             vp[4,0,4] = frtt
             vp[4,4,0] = frtt
             vp[4,4,4] = fttt
-    return vp
+    return vp
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,4 +10,3 @@ def KS(mol, xc='LDA,VWN'):
		return RKS(mol, xc)
		else:
		return UKS(mol, xc)