
Commit

small fixes
wxj6000 committed Jan 5, 2025
1 parent e5ddd9a commit e85afa6
Showing 6 changed files with 45 additions and 68 deletions.
16 changes: 16 additions & 0 deletions benchmarks/cupy_helper/benchmark_memory_copy.py
@@ -123,3 +123,19 @@ def cupy_asarray_contiguous(a, b):
print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")

assert np.linalg.norm(a.get() - b.get()) < 1e-10


+print('----------- Benchmark reduction across devices ------ ')
+from gpu4pyscf.lib.cupy_helper import reduce_to_device
+_num_devices = cp.cuda.runtime.getDeviceCount()
+a_dist = []
+for device_id in range(_num_devices):
+    with cp.cuda.Device(device_id):
+        a = cp.random.rand(512,512,512)
+    a_dist.append(a)
+
+perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3)
+t_kernel = perf_cupy.gpu_times.mean()
+bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9
+print('Reduce to device', t_kernel)
+print(f"Effective Bandwidth: {bandwidth:.2f} GB/s")
4 changes: 2 additions & 2 deletions examples/dft_driver.py
@@ -34,10 +34,10 @@
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer
-mol.verbose = 6
+mol.verbose = 4

mf_df = dft.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
-mf_df.verbose = 6
+mf_df.verbose = 4

if args.solvent:
mf_df = mf_df.PCM()
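The per-step timer lines come from log.timer_debug1, which only prints at PySCF's DEBUG1 level, so verbose = 4 (INFO) silences them, as the comment above says. The mapping, as pyscf.lib.logger defines it:

    from pyscf.lib import logger

    # 0=QUIET, ..., 4=INFO, 5=DEBUG, 6=DEBUG1; timer_debug1 needs verbose >= 6
    mol.verbose = logger.INFO      # == 4, quiet timers
    # mol.verbose = logger.DEBUG1  # == 6, re-enables per-step timer output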
3 changes: 2 additions & 1 deletion gpu4pyscf/df/grad/rhf.py
@@ -151,7 +151,8 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
cart2sph = intopt.cart2sph
orbo_cart = cart2sph @ orbo
dm_cart = cart2sph @ dm @ cart2sph.T


+    with_df._cderi = None # release GPU memory
vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
with_j=with_j, with_k=with_k, omega=omega)
# NOTE: vj and vk are still in cartesian
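Clearing with_df._cderi before the vjk contraction drops the last reference to the cached three-center integrals, so CuPy's memory pool can recycle those blocks inside get_grad_vjk. A minimal sketch of the pattern; the free_all_blocks call is an optional extra step shown as an assumption, not something this commit does:

    import cupy as cp

    def release_cached_cderi(with_df):
        # Drop the last reference; the pool can now recycle the blocks.
        with_df._cderi = None
        # Optionally hand cached, unused blocks back to the CUDA driver.
        cp.get_default_memory_pool().free_all_blocks()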
54 changes: 1 addition & 53 deletions gpu4pyscf/df/grad/uhf.py
@@ -165,59 +165,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True,
orbo_cart = orbo
dm = orbo = None

"""
vj = vk = rhoj_tmp = rhok_tmp = None
vjaux = vkaux = None
naux_cart = intopt._sorted_auxmol.nao
if with_j:
vj = cupy.zeros((3,nao_cart), order='C')
vjaux = cupy.zeros((3,naux_cart))
if with_k:
vk = cupy.zeros((3,nao_cart), order='C')
vkaux = cupy.zeros((3,naux_cart))
cupy.get_default_memory_pool().free_all_blocks()
t1 = log.init_timer()
for cp_kl_id in range(len(intopt.aux_log_qs)):
k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1]
assert k1-k0 <= block_size
if with_j:
rhoj_tmp = rhoj_cart[k0:k1]
if with_k:
rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart)
rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart)
'''
if(rhoj_tmp.flags['C_CONTIGUOUS'] == False):
rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C')
if(rhok_tmp.flags['C_CONTIGUOUS'] == False):
rhok_tmp = rhok_tmp.astype(cupy.float64, order='C')
'''
'''
# outcore implementation
int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf)
size = 3*(k1-k0)*nao_cart*nao_cart
int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C')
rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart)
vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1])
vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip)
int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf)
rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart)
vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1])
vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp)
'''
vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
if with_j: vj += vj_tmp
if with_k: vk += vk_tmp
vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega)
if with_j: vjaux[:, k0:k1] = vj_tmp
if with_k: vkaux[:, k0:k1] = vk_tmp
rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None
t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1)
"""
with_df._cderi = None # release GPU memory
vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
with_j=with_j, with_k=with_k, omega=omega)

7 changes: 3 additions & 4 deletions gpu4pyscf/df/hessian/rhf.py
@@ -121,7 +121,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
int2c = cupy.asarray(int2c, order='C')
int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1])
solve_j2c = _gen_metric_solver(int2c)

+    # int3c contributions
wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega)
rhoj0_P = rhok0_P__ = None
@@ -172,10 +172,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmls
mem_avail = get_avail_mem()
nocc = mocc.shape[1]
slice_size = naux*nocc*9 # largest slice of intermediate variables
-    blksize = int(mem_avail*0.4/8/slice_size/ALIGNED) * ALIGNED
+    blksize = int(mem_avail*0.2/8/slice_size)
     log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
-    if blksize < ALIGNED:
-        raise RuntimeError('Not enough memory for intermediate variables')
+    assert blksize > 0
if hessobj.auxbasis_response:
hk_ao_aux = cupy.zeros([nao,naux,3,3])
for i0, i1 in lib.prange(0,nao,blksize):
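The replacement budgets about 20% of the free device memory, at 8 bytes per float64, for intermediates of slice_size = naux*nocc*9 entries per AO row, and no longer rounds to ALIGNED multiples. Rough arithmetic under assumed sizes (mem_avail, naux, nocc are illustrative numbers only):

    GB = 1024**3
    mem_avail = 16 * GB               # free device memory, as get_avail_mem might report
    naux, nocc = 4000, 200
    slice_size = naux * nocc * 9      # float64 entries per AO in the block
    blksize = int(mem_avail * 0.2 / 8 / slice_size)
    assert blksize > 0                # 59 aux AOs per block for these numbers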
29 changes: 21 additions & 8 deletions gpu4pyscf/dft/numint.py
@@ -414,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
     ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

weights = cupy.empty([ngrids_local])
if xctype == 'LDA':
@@ -425,16 +427,18 @@
rho_tot = cupy.empty([nset,4,ngrids_local])
else:
rho_tot = cupy.empty([nset,5,ngrids_local])

p0 = p1 = 0
for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
max_memory=None,
grid_range=(grid_start, grid_end)):
p1 = p0 + weight.size
weights[p0:p1] = weight
for i in range(nset):
-            if mo_coeff is None:
-                rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx],
+            # If AO is sparse enough, use density matrix to calculate rho
+            if mo_coeff is None or len(idx) < mo_occ.sum():
+                dms_mask = dms[i][idx[:,None],idx]
+                rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
xctype=xctype, hermi=hermi, with_lapl=with_lapl)
else:
assert hermi == 1
@@ -443,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
None, xctype, with_lapl)
p0 = p1
t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
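The reasoning behind the new branch: evaluating rho from the density matrix costs on the order of len(idx)**2 work per grid point, while the occupied-MO path costs about len(idx)*nocc, so once the screened AO set is small the density-matrix route wins; mo_occ.sum() (the electron count) serves as the cutoff. A sketch of the decision with a hypothetical helper, not a function in the source:

    def pick_rho_path(n_ao_mask, mo_coeff, mo_occ):
        # DM path ~ n_ao_mask**2 per grid point; MO path ~ n_ao_mask * nocc.
        if mo_coeff is None or n_ao_mask < mo_occ.sum():
            return 'density-matrix'
        return 'occupied-mo'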

+    # libxc calls are still running on default stream
nelec = cupy.zeros(nset)
excsum = cupy.zeros(nset)
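The grid partitioning above, repeated in the UKS and fxc tasks below, rounds the per-device share up to a multiple of MIN_BLK_SIZE and clamps each range's end to the global grid count, so trailing devices never read past the grid. A standalone sketch; the min_blk_size value and the extra clamp on the start are assumptions:

    def partition_grids(ngrids_glob, num_devices, min_blk_size=128):
        # Even per-device share, rounded up to min_blk_size; the min()
        # clamp keeps the last devices inside the grid (possibly empty).
        per_dev = (ngrids_glob + num_devices - 1) // num_devices
        per_dev = (per_dev + min_blk_size - 1) // min_blk_size * min_blk_size
        return [(min(d * per_dev, ngrids_glob),
                 min((d + 1) * per_dev, ngrids_glob))
                for d in range(num_devices)]

    # partition_grids(1000, 4) -> [(0, 256), (256, 512), (512, 768), (768, 1000)]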
@@ -814,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
max_memory=None,
@@ -1016,8 +1023,11 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

p0 = p1 = grid_start
t1 = t0 = log.init_timer()
@@ -1165,8 +1175,11 @@ def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,

ngrids_glob = grids.coords.shape[0]
     ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
     grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+    ngrids_local = grid_end - grid_start
+    log.debug(f"{ngrids_local} on Device {device_id}")

p0 = p1 = grid_start
t1 = t0 = log.init_timer()
