to_gpu and group grids (#69)

* fixed a bug in screen_index * added unit test for to_gpu * new grids group scheme * use grid_aligned in gpu4pyscf.__config__
pyscf · Dec 27, 2023 · 2e5bc47 · 2e5bc47
1 parent 43f21be
commit 2e5bc47
Show file tree

Hide file tree

Showing 18 changed files with 521 additions and 203 deletions.
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ Installation
 > [!NOTE]
 > The compiled binary packages support compute capability 7.0 and later (Volta and later, such as Tesla V100, RTX 20 series and later). For older GPUs (GTX 10**, Tesla P100), please compile the package with the source code as follows.
 
-Run ```nvidia-smi``` in your terminal to check the installed CUDA version. 
+Run ```nvidia-smi``` in your terminal to check the installed CUDA version.
 
 Choose the proper package based on your CUDA environment.
 
@@ -15,7 +15,7 @@ Choose the proper package based on your CUDA environment.
 | **CUDA 11.x** |  ```pip3 install gpu4pyscf-cuda11x``` |
 | **CUDA 12.x** |  ```pip3 install gpu4pyscf-cuda12x``` |
 
-```cuTensor``` is **highly recommended** to be installed for accelerating tensor contractions. 
+```cuTensor``` is **highly recommended** for accelerating tensor contractions.
 
 For **CUDA 11.x**, ```python -m cupyx.tools.install_library --cuda 11.x --library cutensor```
 
@@ -59,7 +59,7 @@ Limitations
 - Rys roots up to 9 for direct scf scheme;
 - Atomic basis up to g orbitals;
 - Auxiliary basis up to h orbitals;
-- Up to ~168 atoms with def2-tzvpd basis, consuming a large amount of CPU memory;
+- Density fitting scheme up to ~168 atoms with def2-tzvpd basis, bounded CPU memory;
 - Hessian is unavailable for Direct SCF yet;
 - meta-GGA without density laplacian;
 

diff --git a/examples/00-h2o.py b/examples/00-h2o.py
@@ -13,8 +13,11 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+import numpy as np
+import cupy
 import pyscf
 from pyscf import lib
+from pyscf.hessian import thermo
 from gpu4pyscf.dft import rks
 lib.num_threads(8)
 
@@ -30,13 +33,14 @@
 scf_tol = 1e-10
 max_scf_cycles = 50
 screen_tol = 1e-14
-grids_level = 3
+grids_level = 5
 
-mol = pyscf.M(atom=atom, basis=bas, max_memory=32000, output='./pyscf.log')
+mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)
 
 mol.verbose = 4
 mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
 mf_GPU.grids.level = grids_level
+mf_GPU.grids.atom_grid = (99,590)
 mf_GPU.conv_tol = scf_tol
 mf_GPU.max_cycle = max_scf_cycles
 mf_GPU.screen_tol = screen_tol
@@ -55,3 +59,17 @@
 h = mf_GPU.Hessian()
 h.auxbasis_response = 2
 h_dft = h.kernel()
+
+# harmonic analysis
+results = thermo.harmonic_analysis(mol, h_dft)
+thermo.dump_normal_mode(mol, results)
+
+results = thermo.thermo(mf_GPU, results['freq_au'], 298.15, 101325)
+thermo.dump_thermo(mol, results)
+
+# force translational symmetry
+natm = mol.natm
+h_dft = h_dft.transpose([0,2,1,3]).reshape(3*natm,3*natm)
+h_diag = h_dft.sum(axis=0)
+h_dft -= np.diag(h_diag)
+print(h_dft[:3,:3])
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
@@ -5,23 +5,23 @@
 # such as A100-80G
 if props['totalGlobalMem'] >= 64 * GB:
     min_ao_blksize = 128
-    min_grid_blksize = 128*128
+    min_grid_blksize = 256*256#128*128
     ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9
     number_of_threads = 2048 * 108
 # such as V100-32G
 elif props['totalGlobalMem'] >= 32 * GB:
     min_ao_blksize = 128
-    min_grid_blksize = 128*128
+    min_grid_blksize = 256*256#128*128
     ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9
     number_of_threads = 1024 * 80
 # such as A30-24GB
 elif props['totalGlobalMem'] >= 16 * GB:
     min_ao_blksize = 128
-    min_grid_blksize = 128*128
+    min_grid_blksize = 256*256#128*128
     ao_aligned = 32
     grid_aligned = 128
     mem_fraction = 0.9

diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
@@ -1,5 +1,5 @@
 from . import lib, grad, hessian, solvent, scf, dft
-__version__ = '0.6.11'
+__version__ = '0.6.12'
 
 # monkey patch libxc reference due to a bug in nvcc
 from pyscf.dft import libxc

diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -35,6 +35,8 @@
 class DF(df.DF):
     from gpu4pyscf.lib.utils import to_gpu, device
 
+    _keys = {'intopt'}
+
     def __init__(self, mol, auxbasis=None):
         super().__init__(mol, auxbasis)
         self.auxmol = None
@@ -210,8 +212,12 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
     if(not use_gpu_memory):
         log.debug("Not enough GPU memory")
         # TODO: async allocate memory
-        mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
-        cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
+        try:
+            mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
+            cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
+        except Exception:
+            raise RuntimeError('Out of CPU memory')
+
     data_stream = cupy.cuda.stream.Stream(non_blocking=False)
     count = 0
     nq = len(intopt.log_qs)

diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py
@@ -13,7 +13,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-
+import copy
 import numpy
 import cupy
 from cupyx.scipy.linalg import solve_triangular
@@ -44,9 +44,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
         if key in mf_grad.base.with_df._rsh_df:
             with_df = mf_grad.base.with_df._rsh_df[key]
         else:
-            raise RuntimeError(f'omega={omega} is not calculated in SCF')
+            dfobj = mf_grad.base.with_df
+            with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
+            #raise RuntimeError(f'omega={omega} is not calculated in SCF')
 
     auxmol = with_df.auxmol
+    if not hasattr(with_df, 'intopt') or with_df._cderi is None:
+        with_df.build(omega=omega)
     intopt = with_df.intopt
 
     naux = with_df.naux

diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py
@@ -383,16 +383,21 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
                                     atmlst, verbose, True):
         h1 += vj1 - vk1 * .5
+        h1ao[ia] = h1
+        '''
         if chkfile is None:
             h1ao[ia] = h1
         else:
             key = 'scf_f1ao/%d' % ia
             lib.chkfile.save(chkfile, key, h1)
+        '''
+    return h1ao
+    '''
     if chkfile is None:
         return h1ao
     else:
         return chkfile
-
+    '''
 def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             verbose=None, with_k=True, omega=None):
     log = logger.new_logger(hessobj, verbose)

diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py
@@ -103,13 +103,17 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
         for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
                                                 atmlst, verbose, True, omega=omega):
             h1ao[ia] -= .5 * (alpha - hyb) * vk1_lr
+    return h1ao
+
+    # support chkfile?
+    '''
     if chkfile is None:
         return h1ao
     else:
         for ia in atmlst:
             lib.chkfile.save(chkfile, 'scf_f1ao/%d'%ia, h1ao[ia])
         return chkfile
-
+    '''
 
 class Hessian(rks_hess.Hessian):
     '''Non-relativistic RKS hessian'''

diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py
@@ -37,14 +37,14 @@
 from cupyx.scipy.spatial.distance import cdist
 from gpu4pyscf.dft import radi
 from gpu4pyscf.lib.cupy_helper import load_library
-
+from gpu4pyscf import __config__ as __gpu4pyscf_config__
 libdft = lib.load_library('libdft')
 libgdft = load_library('libgdft')
 
 from pyscf.dft.gen_grid import GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID
 
 GROUP_BOX_SIZE = 3.0
-ALIGNMENT_UNIT = 32
+ALIGNMENT_UNIT = getattr(__gpu4pyscf_config__, 'grid_aligned', 128)
 # SG0
 # S. Chien and P. Gill,  J. Comput. Chem. 27 (2006) 730-739.
 
@@ -334,6 +334,7 @@ def gen_grid_partition(coords):
 
     coords_all = []
     weights_all = []
+    # support atomic_radii_adjust = None
     assert radii_adjust == radi.treutler_atomic_radii_adjust
     a = -radi.get_treutler_fac(mol, atomic_radii)
     for ia in range(mol.natm):
@@ -377,6 +378,49 @@ def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
     '''
     return make_screen_index(mol, coords, shls_slice, cutoff)
 
+def atomic_group_grids(mol, coords):
+    '''
+    partition the entire space based on atomic position
+    '''
+    from scipy.spatial import distance_matrix
+    natm = mol.natm
+    ngrids = coords.shape[0]
+    atom_coords = mol.atom_coords()
+    dist = distance_matrix(atom_coords, atom_coords)
+    visited = numpy.zeros(natm, dtype=bool)
+    current_node = numpy.argmin(atom_coords[:,0])
+    # greedy traverse atoms
+    path = [current_node]
+    while len(path) < natm:
+        visited[current_node] = True
+        # Set distances to visited nodes as infinity so they won't be chosen
+        distances_to_unvisited = numpy.where(visited, numpy.inf, dist[current_node])
+        next_node = numpy.argmin(distances_to_unvisited)
+        path.append(next_node)
+        current_node = next_node
+
+    atom_coords = cupy.asarray(atom_coords[path])
+    #dij = cupy.sum((atom_coords[:,None,:] - coords[None,:,:])**2, axis=2)
+    #group_ids = cupy.argmin(dij, axis=0)
+
+    coords = cupy.asarray(coords, order='F')
+    atom_coords = cupy.asarray(atom_coords, order='F')
+    group_ids = cupy.empty([ngrids], dtype=numpy.int32)
+    stream = cupy.cuda.get_current_stream()
+    err = libgdft.GDFTgroup_grids(
+        ctypes.cast(stream.ptr, ctypes.c_void_p),
+        ctypes.cast(group_ids.data.ptr, ctypes.c_void_p),
+        ctypes.cast(atom_coords.data.ptr, ctypes.c_void_p),
+        ctypes.cast(coords.data.ptr, ctypes.c_void_p),
+        ctypes.c_int(natm),
+        ctypes.c_int(ngrids)
+    )
+    if err != 0:
+        raise RuntimeError('CUDA Error')
+
+    return group_ids.argsort()
+
+
 def arg_group_grids(mol, coords, box_size=GROUP_BOX_SIZE):
     '''
     Parition the entire space into small boxes according to the input box_size.
@@ -510,11 +554,6 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
         self.coords, self.weights = self.get_partition(
             mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme)
 
-        if sort_grids:
-            idx = arg_group_grids(mol, self.coords)
-            self.coords = self.coords[idx]
-            self.weights = self.weights[idx]
-
         if self.alignment > 1:
             padding = _padding_size(self.size, self.alignment)
             logger.debug(self, 'Padding %d grids', padding)
@@ -523,6 +562,13 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
                 self.coords = cupy.vstack(
                     [self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
                 self.weights = cupy.hstack([self.weights, numpy.zeros(padding)])
+
+        if sort_grids:
+            #idx = arg_group_grids(mol, self.coords)
+            idx = atomic_group_grids(mol, self.coords)
+            self.coords = self.coords[idx]
+            self.weights = self.weights[idx]
+
         if with_non0tab:
             self.non0tab = self.make_mask(mol, self.coords)
             self.screen_index = self.non0tab

diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
@@ -1306,6 +1306,8 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
 class NumInt(numint.NumInt):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
+    _keys = {'screen_idx', 'xcfuns', 'gdftopt'}
+
     def __init__(self, xc='LDA'):
         super().__init__()
         self.gdftopt = None