diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 17498c7d..bf6c65c9 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -391,8 +391,18 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars):
         vxc[1,threshind] = 1.5*W*dW0dG
     return exc,vxc
 
+def gen_grid_range(ngrids, device_id, blksize=MIN_BLK_SIZE):
+    '''
+    Calculate the range of grids assigned to the given device
+    '''
+    ngrids_per_device = (ngrids + _num_devices - 1) // _num_devices
+    ngrids_per_device = (ngrids_per_device + blksize - 1) // blksize * blksize
+    grid_start = min(device_id * ngrids_per_device, ngrids)
+    grid_end = min((device_id + 1) * ngrids_per_device, ngrids)
+    return grid_start, grid_end
+
 def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                 verbose=None, with_lapl=False, grid_range=(), device_id=0, hermi=1):
+                 verbose=None, with_lapl=False, device_id=0, hermi=1):
     ''' nr_rks task on given device
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
@@ -413,12 +423,9 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ao_deriv = 1
 
         ngrids_glob = grids.coords.shape[0]
-        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
-        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        grid_start, grid_end = gen_grid_range(ngrids_glob, device_id)
         ngrids_local = grid_end - grid_start
-        log.debug(f"{ngrids_local} on Device {device_id}")
+        log.debug(f"{ngrids_local} grids on Device {device_id}")
 
         weights = cupy.empty([ngrids_local])
         if xctype == 'LDA':
@@ -439,12 +446,12 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                 if mo_coeff is None:
                     dms_mask = dms[i][idx[:,None],idx]
                     rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask,
-                                                xctype=xctype, hermi=hermi, with_lapl=with_lapl)
+                                                  xctype=xctype, hermi=hermi, with_lapl=with_lapl)
                 else:
                     assert hermi == 1
                     mo_coeff_mask = mo_coeff[idx,:]
                     rho_tot[i,:,p0:p1] = eval_rho2(_sorted_mol, ao_mask, mo_coeff_mask, mo_occ,
-                                                 None, xctype, with_lapl)
+                                                   None, xctype, with_lapl)
             p0 = p1
 
         t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0)
@@ -787,7 +794,7 @@ def nr_rks_group(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     return nelec, excsum, vmat
 
 def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                 verbose=None, with_lapl=False, grid_range=(), device_id=0, hermi=1):
+                 verbose=None, with_lapl=False, device_id=0, hermi=1):
     ''' nr_uks task on one device
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
@@ -817,12 +824,9 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         ao_deriv = 1
 
         ngrids_glob = grids.coords.shape[0]
-        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE
-        grid_start = min(device_id * ngrids_per_device, ngrids_glob)
-        grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob)
+        grid_start, grid_end = gen_grid_range(ngrids_glob, device_id)
         ngrids_local = grid_end - grid_start
-        log.debug(f"{ngrids_local} on Device {device_id}")
+        log.debug(f"{ngrids_local} grids on Device {device_id}")
 
         for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv,
                                                      max_memory=None,
@@ -1674,6 +1678,9 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
             ni.non0ao_idx[lookup_key] = _sparse_index(_sorted_mol, coords,
                                                       opt.l_ctr_offsets)
         pad, idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[lookup_key]
+        if len(idx) == 0:
+            continue
+
         ao_mask = eval_ao(
             _sorted_mol, coords, deriv,
             nao_slice=len(idx),
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 2c01d0ff..e0d535a4 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -136,7 +136,7 @@ def get_veff(ks_grad, mol=None, dm=None, verbose=None):
     return tag_array(exc1_per_atom, exc1_grid=exc)
 
 def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
-                  verbose=None, with_lapl=False, grid_range=(), device_id=0):
+                  verbose=None, with_lapl=False, device_id=0):
     ''' Calculate the gradient of vxc on given device
     '''
     with cupy.cuda.Device(device_id), _streams[device_id]:
@@ -151,10 +151,11 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         opt = ni.gdftopt
         _sorted_mol = opt._sorted_mol
         nset = dms.shape[0]
+
         ngrids_glob = grids.coords.shape[0]
-        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} grids on Device {device_id}")
 
         nset = len(dms)
         assert nset == 1
diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py
index 2be8a2b3..90582d73 100644
--- a/gpu4pyscf/grad/uks.py
+++ b/gpu4pyscf/grad/uks.py
@@ -153,10 +153,11 @@ def _get_vxc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
         opt = ni.gdftopt
         _sorted_mol = opt._sorted_mol
         nset = dms.shape[0]
+
         ngrids_glob = grids.coords.shape[0]
-        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} grids on Device {device_id}")
 
         vmat = cupy.zeros((nset,3,nao,nao))
         if xctype == 'LDA':
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 79a8496f..a1c01079 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -347,9 +347,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ao_loc = mol.ao_loc_nr()
 
     ngrids_glob = grids.coords.shape[0]
-    ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-    grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
+    grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
 
     with cupy.cuda.Device(device_id), _streams[device_id]:
         log = logger.new_logger(mol, verbose)
@@ -551,10 +549,8 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ao_loc = mol.ao_loc_nr()
 
     ngrids_glob = grids.coords.shape[0]
-    ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-    grid_start = device_id * ngrids_per_device
-    grid_end = (device_id + 1) * ngrids_per_device
-
+    grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
+
     with cupy.cuda.Device(device_id), _streams[device_id]:
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
@@ -727,9 +723,9 @@ def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
         ao_deriv = 1
 
         ngrids_glob = grids.coords.shape[0]
-        ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices
-        grid_start = device_id * ngrids_per_device
-        grid_end = (device_id + 1) * ngrids_per_device
+        grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
+        ngrids_local = grid_end - grid_start
+        log.debug(f"{ngrids_local} grids on Device {device_id}")
 
         p0 = p1 = grid_start
         t1 = t0 = log.init_timer()
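
For reference, below is a minimal standalone sketch of the block-aligned partitioning that `gen_grid_range` introduces in the diff above. The helper name `grid_range_sketch` and the explicit `num_devices`/`blksize` arguments are illustrative stand-ins for gpu4pyscf's `_num_devices` and `MIN_BLK_SIZE` module globals, and the grid count in the usage example is arbitrary.

```python
# Sketch of the block-aligned grid partition used by gen_grid_range in the diff.
# num_devices and blksize stand in for gpu4pyscf's _num_devices and MIN_BLK_SIZE.

def grid_range_sketch(ngrids, device_id, num_devices, blksize=128):
    # Split the grid as evenly as possible across devices, then round the
    # per-device chunk up to a multiple of blksize so each device always
    # processes whole integration blocks.
    ngrids_per_device = (ngrids + num_devices - 1) // num_devices
    ngrids_per_device = (ngrids_per_device + blksize - 1) // blksize * blksize
    # Clamp to the global grid size; devices past the end get an empty range.
    grid_start = min(device_id * ngrids_per_device, ngrids)
    grid_end = min((device_id + 1) * ngrids_per_device, ngrids)
    return grid_start, grid_end

if __name__ == '__main__':
    # 1000 grid points on 4 devices with blksize=128 -> chunks of 256:
    # (0, 256), (256, 512), (512, 768), (768, 1000)
    for dev in range(4):
        print(dev, grid_range_sketch(1000, dev, num_devices=4, blksize=128))
```

Because of the clamping in the last two lines of the helper, a device whose block-aligned chunk starts beyond the global grid simply receives an empty range (`grid_start == grid_end`) rather than reading past the end of the grid.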