From e2bad5831ad9ae92c35abffc373ff0d6a449c2ee Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Fri, 15 Dec 2023 15:47:29 -0800 Subject: [PATCH] Improve dft (#66) * save * implement SMD solvent model * updated readme * fixed issues after v0.6.9 * format examples * output = /dev/null in test_smd.py * evaluate sparse ao directly --- examples/dft_driver.py | 5 +- gpu4pyscf/__config__.py | 4 +- gpu4pyscf/df/int3c2e.py | 23 ++- gpu4pyscf/dft/gen_grid.py | 11 +- gpu4pyscf/dft/numint.py | 222 ++++++++++++++++--------- gpu4pyscf/lib/gdft/nr_eval_gto.cu | 258 ++++++++++++++++++----------- gpu4pyscf/lib/gdft/nr_eval_gto.cuh | 4 + gpu4pyscf/lib/gdft/vv10.cu | 49 +++--- gpu4pyscf/lib/gint/gout3c2e.cu | 64 +++---- 9 files changed, 411 insertions(+), 229 deletions(-) diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 96b1718c..0349598f 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -35,7 +35,7 @@ max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 0 +mol.verbose = 6 mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) mf_df.verbose = 6 @@ -47,12 +47,15 @@ mf_df.with_solvent.eps = 78.3553 mf_df.grids.atom_grid = (99,590) +if mf_df._numint.libxc.is_nlc(mf_df.xc): + mf_df.nlcgrids.atom_grid = (50,194) mf_df.direct_scf_tol = 1e-14 mf_df.direct_scf = 1e-14 mf_df.conv_tol = 1e-12 e_tot = mf_df.kernel() scf_time = time.time() - start_time print(f'compute time for energy: {scf_time:.3f} s') +exit() start_time = time.time() g = mf_df.nuc_grad_method() diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py index 0a478cc2..3bf09110 100644 --- a/gpu4pyscf/__config__.py +++ b/gpu4pyscf/__config__.py @@ -5,8 +5,8 @@ # such as A100-80G if props['totalGlobalMem'] >= 64 * GB: min_ao_blksize = 256 - min_grid_blksize = 256*256 - ao_aligned = 64 + min_grid_blksize = 128*128 + ao_aligned = 32 grid_aligned = 128 mem_fraction = 0.9 number_of_threads = 2048 * 108 diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index d61accb9..56668354 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -215,6 +215,7 @@ def build(self, cutoff=1e-14, group_size=None, l_ctr_offsets, l_ctr_offsets, q_cond, diag_block_with_triu=diag_block_with_triu, aosym=aosym) self.log_qs = log_qs.copy() + cput1 = logger.timer_debug1(sorted_mol, 'Get pairing', *cput1) # contraction coefficient for ao basis cart_ao_loc = self.sorted_mol.ao_loc_nr(cart=True) @@ -239,6 +240,7 @@ def build(self, cutoff=1e-14, group_size=None, inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32) self.rev_ao_idx = inv_idx self.coeff = self.cart2sph[:, inv_idx] + cput1 = logger.timer_debug1(sorted_mol, 'AO cart2sph coeff', *cput1) # pairing auxiliary basis with fake basis set fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts)) @@ -268,6 +270,7 @@ def build(self, cutoff=1e-14, group_size=None, inv_idx = np.argsort(self.sph_aux_idx, kind='stable').astype(np.int32) self.aux_coeff = self.aux_cart2sph[:, inv_idx] aux_l_ctr_offsets += fake_l_ctr_offsets[-1] + cput1 = logger.timer_debug1(tot_mol, 'aux cart2sph coeff', *cput1) ao_loc = self.sorted_mol.ao_loc_nr(cart=False) self.ao_pairs_row, self.ao_pairs_col = get_ao_pairs(pair2bra, pair2ket, ao_loc) @@ -276,6 +279,7 @@ def build(self, cutoff=1e-14, group_size=None, self.cderi_row = cderi_row self.cderi_col = cderi_col self.cderi_diag = cupy.argwhere(cderi_row == cderi_col)[:,0] + cput1 = logger.timer_debug1(tot_mol, 'Get AO pairs', *cput1) aux_pair2bra = [] aux_pair2ket = [] @@ -1434,28 +1438,43 @@ def 
get_pairing(p_offsets, q_offsets, q_cond, log_qs = [] pair2bra = [] pair2ket = [] + for p0, p1 in zip(p_offsets[:-1], p_offsets[1:]): for q0, q1 in zip(q_offsets[:-1], q_offsets[1:]): if aosym and q0 < p0 or not aosym: q_sub = q_cond[p0:p1,q0:q1].ravel() + ''' idx = q_sub.argsort(axis=None)[::-1] q_sorted = q_sub[idx] mask = q_sorted > cutoff idx = idx[mask] ishs, jshs = np.unravel_index(idx, (p1-p0, q1-q0)) + print(ishs.shape) + ''' + mask = q_sub > cutoff + ishs, jshs = np.indices((p1-p0,q1-q0)) + ishs = ishs.ravel()[mask] + jshs = jshs.ravel()[mask] ishs += p0 jshs += q0 pair2bra.append(ishs) pair2ket.append(jshs) - log_q = np.log(q_sorted[mask]) + log_q = np.log(q_sub[mask]) log_q[log_q > 0] = 0 log_qs.append(log_q) elif aosym and p0 == q0 and p1 == q1: q_sub = q_cond[p0:p1,p0:p1].ravel() + ''' idx = q_sub.argsort(axis=None)[::-1] q_sorted = q_sub[idx] ishs, jshs = np.unravel_index(idx, (p1-p0, p1-p0)) mask = q_sorted > cutoff + ''' + + ishs, jshs = np.indices((p1-p0, p1-p0)) + ishs = ishs.ravel() + jshs = jshs.ravel() + mask = q_sub > cutoff if not diag_block_with_triu: # Drop the shell pairs in the upper triangle for diagonal blocks mask &= ishs >= jshs @@ -1469,7 +1488,7 @@ def get_pairing(p_offsets, q_offsets, q_cond, pair2bra.append(ishs) pair2ket.append(jshs) - log_q = np.log(q_sorted[mask]) + log_q = np.log(q_sub[mask]) log_q[log_q > 0] = 0 log_qs.append(log_q) return log_qs, pair2bra, pair2ket diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index ea76e2f8..f1ca947b 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -41,8 +41,9 @@ libdft = lib.load_library('libdft') libgdft = load_library('libgdft') -from pyscf.dft.gen_grid import GROUP_BOX_SIZE, GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID +from pyscf.dft.gen_grid import GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID +GROUP_BOX_SIZE = 3.0 ALIGNMENT_UNIT = 32 # SG0 # S. Chien and P. Gill, J. Comput. Chem. 27 (2006) 730-739. 
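The get_pairing rewrite above replaces the argsort-based screening with a plain boolean mask: the O(N log N) sort over each q_cond block is avoided, and the surviving (ish, jsh) shell pairs simply keep their natural row-major order. A minimal NumPy sketch of the masking idea — the helper name, signature, and cutoff default are illustrative, not part of the patch:

    import numpy as np

    def screen_pairs(q_cond_block, p0, q0, cutoff=1e-14):
        # Keep (ish, jsh) pairs whose Schwarz estimate survives the cutoff,
        # without sorting -- mirrors the masked path in get_pairing.
        p, q = q_cond_block.shape
        mask = q_cond_block.ravel() > cutoff
        ishs, jshs = np.indices((p, q))
        ishs = ishs.ravel()[mask] + p0   # shift back to absolute shell ids
        jshs = jshs.ravel()[mask] + q0
        log_q = np.log(q_cond_block.ravel()[mask])
        log_q[log_q > 0] = 0             # clamp the log estimates at 0
        return ishs, jshs, log_q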
@@ -388,8 +389,12 @@ def arg_group_grids(mol, coords, box_size=GROUP_BOX_SIZE): box_ids[box_ids[:,0] > boxes[0], 0] = boxes[0] box_ids[box_ids[:,1] > boxes[1], 1] = boxes[1] box_ids[box_ids[:,2] > boxes[2], 2] = boxes[2] - rev_idx = numpy.unique(box_ids.get(), axis=0, return_inverse=True)[1] - return rev_idx.argsort(kind='stable') + + boxes *= 2 # for safety + box_id = box_ids[:,0] + box_ids[:,1] * boxes[0] + box_ids[:,2] * boxes[0] * boxes[1] + #rev_idx = numpy.unique(box_ids.get(), axis=0, return_inverse=True)[1] + rev_idx = cupy.unique(box_id, return_inverse=True)[1] + return rev_idx.argsort() def _load_conf(mod, name, default): var = getattr(__config__, name, None) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 237091d4..2e61b533 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -24,7 +24,7 @@ from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction -from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack +from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack, take_last2d from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ from gpu4pyscf.lib import logger @@ -50,50 +50,72 @@ libgdft.GDFTdot_ao_ao_sparse.restype = ctypes.c_int libgdft.GDFTdot_aow_ao_sparse.restype = ctypes.c_int -def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, - non0tab=None, out=None, verbose=None): - assert shls_slice is None - ngrids = coords.shape[0] - coords = cupy.asarray(coords.T, order='C') - comp = (deriv+1)*(deriv+2)*(deriv+3)//6 +def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, nao_slice=None, ao_loc_slice=None, + non0tab=None, out=None, verbose=None, ctr_offsets_slice=None): + ''' evaluate ao values for given coords and shell indices + Kwargs: + shls_slice : offsets of shell slices to be evaluated + ao_loc_slice: offsets of ao slices to be evaluated + ctr_offsets_slice: offsets of contraction patterns + Returns: + ao: comp x nao_slice x ngrids, ao is in C-contiguous + ''' opt = getattr(ni, 'gdftopt', None) - stream = cupy.cuda.get_current_stream() + with_opt = True if opt is None: ni.build(mol, coords) opt = ni.gdftopt + with_opt = False + mol = opt.mol + + if shls_slice is None: + shls_slice = cupy.arange(mol.nbas, dtype=np.int32) + ctr_offsets = opt.l_ctr_offsets + ctr_offsets_slice = opt.l_ctr_offsets + ao_loc_slice = cupy.asarray(mol.ao_loc_nr()) + nao_slice = mol.nao + else: + ctr_offsets = opt.l_ctr_offsets + + nctr = ctr_offsets.size - 1 + ngrids = coords.shape[0] + coords = cupy.asarray(coords.T, order='C') + comp = (deriv+1)*(deriv+2)*(deriv+3)//6 + stream = cupy.cuda.get_current_stream() + + if not with_opt: # mol may be different to _GDFTOpt.mol. 
# nao should be consistent with the _GDFTOpt.mol object - nao = opt.coeff.shape[0] coeff = cupy.asarray(opt.coeff) - ao = cupy.zeros((comp, nao, ngrids), order='C') - mol = opt.mol + ao = cupy.zeros((comp, nao_slice, ngrids), order='C') with opt.gdft_envs_cache(): err = libgdft.GDFTeval_gto( ctypes.cast(stream.ptr, ctypes.c_void_p), ctypes.cast(ao.data.ptr, ctypes.c_void_p), ctypes.c_int(deriv), ctypes.c_int(opt.mol.cart), ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - opt.l_ctr_offsets.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(opt.l_ctr_offsets.size - 1), - mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(mol.natm), - mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(mol.nbas), - mol._env.ctypes.data_as(ctypes.c_void_p)) + ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao_slice), + ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), + ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), + mol._bas.ctypes.data_as(ctypes.c_void_p)) ao = contract('nig,ij->njg', ao, coeff).transpose([0,2,1]) else: - nao = opt.coeff.shape[0] - ao = cupy.zeros((comp, nao, ngrids), order='C') + ao = cupy.zeros((comp, nao_slice, ngrids), order='C') err = libgdft.GDFTeval_gto( ctypes.cast(stream.ptr, ctypes.c_void_p), ctypes.cast(ao.data.ptr, ctypes.c_void_p), ctypes.c_int(deriv), ctypes.c_int(opt.mol.cart), ctypes.cast(coords.data.ptr, ctypes.c_void_p), ctypes.c_int(ngrids), - opt.l_ctr_offsets.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(opt.l_ctr_offsets.size - 1), - mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(mol.natm), - mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(mol.nbas), - mol._env.ctypes.data_as(ctypes.c_void_p)) + ctypes.cast(shls_slice.data.ptr, ctypes.c_void_p), + ctypes.cast(ao_loc_slice.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao_slice), + ctr_offsets.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nctr), + ctr_offsets_slice.ctypes.data_as(ctypes.c_void_p), + mol._bas.ctypes.data_as(ctypes.c_void_p)) if err != 0: - raise RuntimeError('CUDA Error') + raise RuntimeError('CUDA Error in evaluating AO') if deriv == 0: ao = ao[0] @@ -307,7 +329,7 @@ def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA', return rho def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): - thresh=1e-8 + thresh=1e-10 #output exc=cupy.zeros(rho[0,:].size) @@ -354,6 +376,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): dW0dG=W0tmp*R/(G*W0) K=Kvv*(R**(1./6.)) dKdR=(1./6.)*K + vvcoords = cupy.asarray(vvcoords, order='F') coords = cupy.asarray(coords, order='F') @@ -456,7 +479,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, exc, vxc = ni.eval_xc_eff(xc_code, rho, deriv=1, xctype=xctype)[:2] vxc = cupy.asarray(vxc, order='C') exc = cupy.asarray(exc, order='C') - t1 = log.timer_debug1('eval vxc', *t0) + t1 = log.timer_debug1('eval vxc', *t1) if xctype == 'LDA': den = rho * weight wv = weight * vxc[0] @@ -526,6 +549,9 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat = contract('pi,npq->niq', coeff, vmat) vmat = contract('qj,niq->nij', coeff, vmat) + #rev_ao_idx = opt.rev_ao_idx + #vmat = take_last2d(vmat, rev_ao_idx) + if xctype != 'LDA': #transpose_sum(vmat) vmat = vmat + vmat.transpose([0,2,1]) @@ -652,28 +678,37 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, return nelec, excsum, vmat -def get_rho(ni, mol, dm, grids, max_memory=2000): +def get_rho(ni, mol, dm, grids, max_memory=2000, 
verbose=None): opt = getattr(ni, 'gdftopt', None) if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt mol = opt.mol + log = logger.new_logger(mol, verbose) coeff = cupy.asarray(opt.coeff) nao = coeff.shape[0] dm = coeff @ cupy.asarray(dm) @ coeff.T - with opt.gdft_envs_cache(): - mem_avail = get_avail_mem() - block_size = int((mem_avail*.2/8/3/nao - nao*2)/ ALIGNED) * ALIGNED - #logger.debug1(mol, 'Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) - if block_size < ALIGNED: - raise RuntimeError('Not enough GPU memory') - ngrids = grids.weights.size - rho = cupy.empty(ngrids) - for p0, p1 in lib.prange(0, ngrids, block_size): - ao = eval_ao(ni, mol, grids.coords[p0:p1], deriv=0) - rho[p0:p1] = eval_rho(mol, ao, dm, xctype='LDA', hermi=1) - ao = None + mo_coeff = getattr(dm, 'mo_coeff', None) + mo_occ = getattr(dm,'mo_occ', None) + + if mo_coeff is not None: + mo_coeff = coeff @ mo_coeff + + ngrids = grids.weights.size + rho = cupy.empty(ngrids) + p0 = p1 = 0 + for ao_mask, idx, weight, _ in ni.block_loop(mol, grids, nao, 0): + p1 = p0 + weight.size + t0 = log.init_timer() + if mo_coeff is None: + rho[p0:p1] = eval_rho(mol, ao_mask, dm[np.ix_(idx,idx)], xctype='LDA', hermi=1) + else: + mo_coeff_mask = mo_coeff[idx,:] + rho[p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, 'LDA') + p0 = p1 + log.timer_debug1('eval rho', *t0) + if FREE_CUPY_CACHE: dm = None cupy.get_default_memory_pool().free_all_blocks() @@ -960,6 +995,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, rho = eval_rho(opt.mol, ao, dms[0][np.ix_(mask,mask)], xctype='GGA', hermi=1) vvrho.append(rho) rho = cupy.hstack(vvrho) + t1 = log.timer_debug1('eval rho', *t0) exc = 0 vxc = 0 nlc_coefs = ni.nlc_coeff(xc_code) @@ -968,10 +1004,14 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, grids.coords, nlc_pars) exc += e * fac vxc += v * fac + t1 = log.timer_debug1('eval vv on grids', *t1) + den = rho[0] * grids.weights nelec = den.sum() excsum = cupy.dot(den, exc) vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0) + t1 = log.timer_debug1('transform vxc', *t1) + vmat = cupy.zeros((nao,nao)) p1 = 0 for ao, mask, weight, coords \ @@ -982,6 +1022,7 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, aow = _scale_ao(ao, wv) #vmat += ao[0].dot(aow.T) add_sparse(vmat, ao[0].dot(aow.T), mask) + t1 = log.timer_debug1('integration', *t1) vmat = vmat + vmat.T vmat = contract('pi,pq->iq', coeff, vmat) @@ -1095,14 +1136,19 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None "v3sigma2lapl", "v3sigma2tau", "v3sigmalapl2", "v3sigmalapltau", "v3sigmatau2", "v3lapl3", "v3lapl2tau", "v3lapltau2", "v3tau3"] - ret_full = {} - for xcfun, w in ni.xcfuns: + if len(ni.xcfuns) == 1: + xcfun, _ = ni.xcfuns[0] xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) - for label in xc_res: - if label in ret_full: - ret_full[label] += xc_res[label] * w - else: - ret_full[label] = xc_res[label] * w + ret_full = xc_res + else: + ret_full = {} + for xcfun, w in ni.xcfuns: + xc_res = xcfun.compute(inp, do_exc=True, do_vxc=do_vxc, do_fxc=do_fxc, do_kxc=do_kxc) + for label in xc_res: + if label in ret_full: + ret_full[label] += xc_res[label] * w + else: + ret_full[label] = xc_res[label] * w vxc = None fxc = None kxc = None @@ -1177,40 +1223,68 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, for ip0, ip1 in lib.prange(0, ngrids, blksize): coords = grids.coords[ip0:ip1] weight = 
grids.weights[ip0:ip1] - #sindex = ni.screen_index[ip0//GRID_BLKSIZE:] - t0 = log.init_timer() - ao = eval_ao(ni, mol, coords, deriv) - t0 = log.timer_debug1('eval ao', *t0) # cache ao indices if (deriv, block_id, blksize, ngrids) not in ni.non0ao_idx: + stream = cupy.cuda.get_current_stream() + cutoff = 1e-12 + ng = ip1 - ip0 + ao_loc = mol.ao_loc_nr() + nbas = mol.nbas t0 = log.init_timer() - if deriv == 0: - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) - all_idx = cupy.arange(ao.shape[0], dtype=np.int32) - idx = all_idx[mask] - pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) - zero_idx = all_idx[~mask][:pad] - idx = cupy.hstack([idx, zero_idx]) - ao_mask = ao[idx,:] - else: - mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) - all_idx = cupy.arange(ao.shape[1], dtype=np.int32) - idx = all_idx[mask] - pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) - zero_idx = all_idx[~mask][:pad] - idx = cupy.hstack([idx, zero_idx]) - ao_mask = ao[:,idx,:] - ni.non0ao_idx[deriv, block_id, blksize, ngrids] = idx + non0shl_idx = cupy.zeros(len(ao_loc)-1, dtype=np.int32) + libgdft.GDFTscreen_index( + ctypes.cast(stream.ptr, ctypes.c_void_p), + ctypes.cast(non0shl_idx.data.ptr, ctypes.c_void_p), + ctypes.c_double(cutoff), + ctypes.cast(coords.data.ptr, ctypes.c_void_p), + ctypes.c_int(ng), + ao_loc.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbas), + mol._bas.ctypes.data_as(ctypes.c_void_p)) + non0shl_idx = non0shl_idx.get() + + # offset of contraction pattern, used in eval_ao + cumsum = np.cumsum(non0shl_idx, dtype=np.int32) + glob_ctr_offsets = opt.l_ctr_offsets + ctr_offsets_slice = cumsum[glob_ctr_offsets-1] + ctr_offsets_slice[0] = 0 + + from pyscf import gto + gto_type = 'cart' if mol.cart else 'sph' + non0shl_idx = non0shl_idx == 1 + ao_loc_slice = gto.moleintor.make_loc(mol._bas[non0shl_idx,:], gto_type) + ao_loc_slice = cupy.asarray(ao_loc_slice, dtype=np.int32) + non0ao_idx = [] + zero_idx = [] + for sh_idx in range(len(ao_loc)-1): + p0, p1 = ao_loc[sh_idx], ao_loc[sh_idx+1] + if non0shl_idx[sh_idx]: + non0ao_idx += range(p0,p1) + else: + zero_idx += range(p0,p1) + + idx = cupy.asarray(non0ao_idx, dtype=np.int32) + zero_idx = cupy.asarray(zero_idx, dtype=np.int32) + pad = (len(idx) + AO_ALIGNMENT - 1) // AO_ALIGNMENT * AO_ALIGNMENT - len(idx) + idx = cupy.hstack([idx, zero_idx[:pad]]) + non0shl_idx = cupy.asarray(np.where(non0shl_idx)[0], dtype=np.int32) + + ni.non0ao_idx[deriv, block_id, blksize, ngrids] = (idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice) log.timer_debug1('init ao sparsity', *t0) else: - idx = ni.non0ao_idx[deriv, block_id, blksize, ngrids] - if deriv == 0: - ao_mask = ao[idx,:] - else: - ao_mask = ao[:,idx,:] + idx, non0shl_idx, ctr_offsets_slice, ao_loc_slice = ni.non0ao_idx[deriv, block_id, blksize, ngrids] + + t0 = log.init_timer() + ao_mask = eval_ao( + ni, mol, coords, deriv, + nao_slice=len(idx), + shls_slice=non0shl_idx, + ao_loc_slice=ao_loc_slice, + ctr_offsets_slice=ctr_offsets_slice) + block_id += 1 - log.timer_debug1('extract sparse ao', *t0) + log.timer_debug1('evaluate ao slice', *t0) yield ao_mask, idx, weight, coords class NumInt(numint.NumInt): @@ -1539,7 +1613,7 @@ def build(self, mol=None): pmol._decontracted = True self.mol = pmol inv_idx = np.argsort(ao_idx, kind='stable').astype(np.int32) - self.rev_ao_idx = cupy.asarray(inv_idx) + self.rev_ao_idx = cupy.asarray(inv_idx, dtype=np.int32) self.coeff = coeff[ao_idx] self.l_ctr_offsets = np.append(0, 
np.cumsum(l_ctr_counts)).astype(np.int32)
         self.l_bas_offsets = np.append(0, np.cumsum(l_counts)).astype(np.int32)
diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cu b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
index da59b1c9..fe99ed74 100644
--- a/gpu4pyscf/lib/gdft/nr_eval_gto.cu
+++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cu
@@ -32,6 +32,9 @@
 #define LMAX 8
 #define GTO_MAX_CART 15

+#define MIN(X,Y) ((X)<(Y)?(X):(Y))
+#define MAX(X,Y) ((X)>(Y)?(X):(Y))
+
 template <int ANG> __device__
 static void _nabla1(double *fx1, double *fy1, double *fz1,
                     double *fx0, double *fy0, double *fz0, double a){
@@ -40,7 +43,7 @@ static void _nabla1(double *fx1, double *fy1, double *fz1,
     fx1[0] = a2*fx0[1];
     fy1[0] = a2*fy0[1];
     fz1[0] = a2*fz0[1];
-
+#pragma unroll
     for (i = 1; i <= ANG; i++) {
         fx1[i] = i*fx0[i-1] + a2*fx0[i+1];
         fy1[i] = i*fy0[i-1] + a2*fy0[i+1];
@@ -48,6 +51,38 @@ static void _nabla1(double *fx1, double *fy1, double *fz1,
     }
 }

+__global__
+void _screen_index(int *non0shl_idx, double cutoff, int l, int ish, int nprim, double *coords, int ngrids){
+    int grid_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (grid_id >= ngrids){
+        return;
+    }
+    int natm = c_envs.natm;
+    int atm_id = c_bas_atom[ish];
+    double* atm_coords = c_envs.atom_coordx;
+
+    double gridx = coords[3*grid_id + 0];
+    double gridy = coords[3*grid_id + 1];
+    double gridz = coords[3*grid_id + 2];
+
+    double rx = gridx - atm_coords[atm_id + 0*natm];
+    double ry = gridy - atm_coords[atm_id + 1*natm];
+    double rz = gridz - atm_coords[atm_id + 2*natm];
+    double rr = rx * rx + ry * ry + rz * rz;
+
+    double *exps = c_envs.env + c_bas_exp[ish];
+    double *coeffs = c_envs.env + c_bas_coeff[ish];
+    double maxc = 0.0;
+    double min_exp = 1e9;
+    for (int ip = 0; ip < nprim; ++ip) {
+        min_exp = MIN(min_exp, exps[ip]);
+        maxc = MAX(maxc, coeffs[ip]);
+    }
+    double gto_sup = -min_exp * rr + .5 * log(rr) * l + log(maxc);
+    int is_large = gto_sup > log(cutoff);
+    atomicOr(non0shl_idx + ish, is_large);
+}
+
 template <int ANG> __device__
 static void _cart2sph(double g_cart[GTO_MAX_CART], double *g_sph, int stride, int grid_id){
     if (ANG == 0) {
@@ -105,9 +140,10 @@ static void _cart_kernel_deriv0(BasOffsets offsets)
     int bas_id = blockIdx.y;
     int natm = c_envs.natm;

-    int ish = offsets.bas_off + bas_id;
-    int atm_id = c_bas_atom[ish];
-    size_t i0 = c_envs.ao_loc[ish];
+    int local_ish = offsets.bas_off + bas_id;
+    int glob_ish = offsets.bas_indices[local_ish];
+    int atm_id = c_bas_atom[glob_ish];
+    size_t i0 = offsets.ao_loc[local_ish];
     double* __restrict__ gto = offsets.data + i0 * ngrids;

     double *atom_coordx = c_envs.atom_coordx;
@@ -120,8 +156,8 @@
     double ry = gridy[grid_id] - atom_coordy[atm_id];
     double rz = gridz[grid_id] - atom_coordz[atm_id];
     double rr = rx * rx + ry * ry + rz * rz;
-    double *exps = c_envs.env + c_bas_exp[ish];
-    double *coeffs = c_envs.env + c_bas_coeff[ish];
+    double *exps = c_envs.env + c_bas_exp[glob_ish];
+    double *coeffs = c_envs.env + c_bas_coeff[glob_ish];

     double ce = 0;
     for (int ip = 0; ip < offsets.nprim; ++ip) {
@@ -205,11 +241,11 @@ static void _cart_kernel_deriv1(BasOffsets offsets)
     int bas_id = blockIdx.y;
     int natm = c_envs.natm;

-    int nbas = c_envs.nbas;
-    int nao = c_envs.ao_loc[nbas];
-    int ish = offsets.bas_off + bas_id;
-    int atm_id = c_bas_atom[ish];
-    size_t i0 = c_envs.ao_loc[ish];
+    int nao = offsets.nao;
+    int local_ish = offsets.bas_off + bas_id;
+    int glob_ish = offsets.bas_indices[local_ish];
+    int atm_id = c_bas_atom[glob_ish];
+    size_t i0 = offsets.ao_loc[local_ish];
     double* __restrict__ gto =
offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -225,8 +261,8 @@ static void _cart_kernel_deriv1(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double ce = 0; double ce_2a = 0; @@ -424,7 +460,6 @@ static void _cart_kernel_deriv1(BasOffsets offsets) */ else{ double fx0[16], fy0[16], fz0[16]; - double fx1[16], fy1[16], fz1[16]; fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; for (int lx = 1; lx <= ANG+2; lx++){ @@ -433,6 +468,7 @@ static void _cart_kernel_deriv1(BasOffsets offsets) fz0[lx] = fz0[lx-1] * rz; } + double fx1[16], fy1[16], fz1[16]; for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; @@ -462,11 +498,11 @@ static void _cart_kernel_deriv2(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -488,8 +524,8 @@ static void _cart_kernel_deriv2(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; double fx1[16], fy1[16], fz1[16]; @@ -537,11 +573,11 @@ static void _cart_kernel_deriv3(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -573,8 +609,8 @@ static void _cart_kernel_deriv3(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; double fx1[16], fy1[16], fz1[16]; @@ -635,11 +671,11 @@ static void _cart_kernel_deriv4(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao 
= c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -686,8 +722,8 @@ static void _cart_kernel_deriv4(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; double fx1[16], fy1[16], fz1[16]; @@ -766,9 +802,10 @@ static void _sph_kernel_deriv0(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double *atom_coordx = c_envs.atom_coordx; @@ -781,8 +818,8 @@ static void _sph_kernel_deriv0(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double ce = 0; for (int ip = 0; ip < offsets.nprim; ++ip) { @@ -860,11 +897,12 @@ static void _sph_kernel_deriv1(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; + double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -880,8 +918,8 @@ static void _sph_kernel_deriv1(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double ce = 0; double ce_2a = 0; @@ -1147,11 +1185,11 @@ static void _sph_kernel_deriv2(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = 
offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -1173,25 +1211,27 @@ static void _sph_kernel_deriv2(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; - double fx1[16], fy1[16], fz1[16]; - double fx2[16], fy2[16], fz2[16]; - fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; +#pragma unroll for (int lx = 1; lx <= ANG+2; lx++){ fx0[lx] = fx0[lx-1] * rx; fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } - double g[GTO_MAX_CART]; + double fx1[16], fy1[16], fz1[16]; + double fx2[16], fy2[16], fz2[16]; + for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); _nabla1(fx2, fy2, fz2, fx1, fy1, fz1, exps[ip]); + + double g[GTO_MAX_CART]; _cart_gto(g, ce, fx0, fy0, fz0); _cart2sph(g, gto, ngrids, grid_id); _cart_gto(g, ce, fx1, fy0, fz0); _cart2sph(g, gtox, ngrids, grid_id); _cart_gto(g, ce, fx0, fy1, fz0); _cart2sph(g, gtoy, ngrids, grid_id); @@ -1217,11 +1257,11 @@ static void _sph_kernel_deriv3(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -1253,28 +1293,28 @@ static void _sph_kernel_deriv3(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; - double fx1[16], fy1[16], fz1[16]; - double fx2[16], fy2[16], fz2[16]; - double fx3[16], fy3[16], fz3[16]; - fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; +#pragma unroll for (int lx = 1; lx <= ANG+3; lx++){ fx0[lx] = fx0[lx-1] * rx; fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } + double fx1[16], fy1[16], fz1[16]; + double fx2[16], fy2[16], fz2[16]; + double fx3[16], fy3[16], fz3[16]; - double g[GTO_MAX_CART]; for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); _nabla1(fx2, fy2, fz2, fx1, fy1, fz1, exps[ip]); _nabla1(fx3, fy3, fz3, fx2, fy2, fz2, exps[ip]); + double g[GTO_MAX_CART]; _cart_gto(g, ce, fx0, fy0, fz0); _cart2sph(g, gto, ngrids, grid_id); _cart_gto(g, ce, fx1, fy0, fz0); _cart2sph(g, gtox, ngrids, grid_id); _cart_gto(g, ce, fx0, fy1, fz0); _cart2sph(g, gtoy, ngrids, grid_id); @@ -1310,11 +1350,11 @@ static void 
_sph_kernel_deriv4(BasOffsets offsets) int bas_id = blockIdx.y; int natm = c_envs.natm; - int nbas = c_envs.nbas; - int nao = c_envs.ao_loc[nbas]; - int ish = offsets.bas_off + bas_id; - int atm_id = c_bas_atom[ish]; - size_t i0 = c_envs.ao_loc[ish]; + int nao = offsets.nao; + int local_ish = offsets.bas_off + bas_id; + int glob_ish = offsets.bas_indices[local_ish]; + int atm_id = c_bas_atom[glob_ish]; + size_t i0 = offsets.ao_loc[local_ish]; double* __restrict__ gto = offsets.data + i0 * ngrids; double* __restrict__ gtox = offsets.data + (nao * 1 + i0) * ngrids; double* __restrict__ gtoy = offsets.data + (nao * 2 + i0) * ngrids; @@ -1361,22 +1401,22 @@ static void _sph_kernel_deriv4(BasOffsets offsets) double ry = gridy[grid_id] - atom_coordy[atm_id]; double rz = gridz[grid_id] - atom_coordz[atm_id]; double rr = rx * rx + ry * ry + rz * rz; - double *exps = c_envs.env + c_bas_exp[ish]; - double *coeffs = c_envs.env + c_bas_coeff[ish]; + double *exps = c_envs.env + c_bas_exp[glob_ish]; + double *coeffs = c_envs.env + c_bas_coeff[glob_ish]; double fx0[16], fy0[16], fz0[16]; - double fx1[16], fy1[16], fz1[16]; - double fx2[16], fy2[16], fz2[16]; - double fx3[16], fy3[16], fz3[16]; - double fx4[16], fy4[16], fz4[16]; - fx0[0] = 1.0; fy0[0] = 1.0; fz0[0] = 1.0; +#pragma unroll for (int lx = 1; lx <= ANG+4; lx++){ fx0[lx] = fx0[lx-1] * rx; fy0[lx] = fy0[lx-1] * ry; fz0[lx] = fz0[lx-1] * rz; } - double g[GTO_MAX_CART]; + double fx1[16], fy1[16], fz1[16]; + double fx2[16], fy2[16], fz2[16]; + double fx3[16], fy3[16], fz3[16]; + double fx4[16], fy4[16], fz4[16]; + for (int ip = 0; ip < offsets.nprim; ++ip) { double ce = coeffs[ip] * exp(-exps[ip] * rr) * offsets.fac; _nabla1(fx1, fy1, fz1, fx0, fy0, fz0, exps[ip]); @@ -1384,6 +1424,7 @@ static void _sph_kernel_deriv4(BasOffsets offsets) _nabla1(fx3, fy3, fz3, fx2, fy2, fz2, exps[ip]); _nabla1(fx4, fy4, fz4, fx3, fy3, fz3, exps[ip]); + double g[GTO_MAX_CART]; _cart_gto(g, ce, fx0, fy0, fz0); _cart2sph(g, gto, ngrids, grid_id); _cart_gto(g, ce, fx1, fy0, fz0); _cart2sph(g, gtox, ngrids, grid_id); _cart_gto(g, ce, fx0, fy1, fz0); _cart2sph(g, gtoy, ngrids, grid_id); @@ -1491,26 +1532,36 @@ inline double CINTcommon_fac_sp(int l) } int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart, - double *grids, int ngrids, int *bas_loc, int nbuckets, - int *atm, int natm, int *bas, int nbas, double *env) + double *grids, int ngrids, + int *bas_indices, + int *ao_loc, int nao, + int *ctr_offsets, int nctr, + int *local_ctr_offsets, + int *bas) { BasOffsets offsets; //DEVICE_INIT(double, d_grids, grids, ngrids * 3); offsets.gridx = grids;//d_grids; offsets.ngrids = ngrids; offsets.data = ao; - + offsets.ao_loc = ao_loc; + offsets.bas_indices = bas_indices; + offsets.nbas = local_ctr_offsets[nctr]; + offsets.nao = nao; dim3 threads(THREADS); dim3 blocks((ngrids+THREADS-1)/THREADS); - for (int bucket = 0; bucket < nbuckets; ++bucket) { - int ish = bas_loc[bucket]; - int l = bas[ANG_OF+ish*BAS_SLOTS]; - - offsets.bas_off = ish; - offsets.nprim = bas[NPRIM_OF+ish*BAS_SLOTS]; + for (int ictr = 0; ictr < nctr; ++ictr) { + int local_ish = local_ctr_offsets[ictr]; + int glob_ish = ctr_offsets[ictr]; //bas_indices[local_ish]; + int l = bas[ANG_OF+glob_ish*BAS_SLOTS]; + offsets.bas_off = local_ish; + offsets.nprim = bas[NPRIM_OF+glob_ish*BAS_SLOTS]; offsets.fac = CINTcommon_fac_sp(l); - blocks.y = bas_loc[bucket+1] - ish; + blocks.y = local_ctr_offsets[ictr+1] - local_ctr_offsets[ictr]; + if (blocks.y == 0){ + continue; + } switch (deriv) { case 0: if (cart 
== 1) {
@@ -1640,4 +1691,25 @@ int GDFTeval_gto(cudaStream_t stream, double *ao, int deriv, int cart,
     //FREE(d_grids);
     return 0;
 }
+
+int GDFTscreen_index(cudaStream_t stream, int *non0shl_idx, double cutoff,
+                     double *grids, int ngrids, int *bas_loc, int nbas, int *bas)
+{
+    dim3 threads(THREADS);
+    dim3 blocks((ngrids+THREADS-1)/THREADS);
+
+    for (int shl_id = 0; shl_id < nbas; ++shl_id) {
+        int l = bas[ANG_OF+shl_id*BAS_SLOTS];
+        int nprim = bas[NPRIM_OF+shl_id*BAS_SLOTS];
+        _screen_index<<<blocks, threads, 0, stream>>>(non0shl_idx, cutoff, l, shl_id, nprim, grids, ngrids);
+
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            fprintf(stderr, "CUDA Error of GDFTscreen_index: %s\n", cudaGetErrorString(err));
+            return 1;
+        }
+    }
+    return 0;
+}
+
 }
diff --git a/gpu4pyscf/lib/gdft/nr_eval_gto.cuh b/gpu4pyscf/lib/gdft/nr_eval_gto.cuh
index be1a76fa..430ef549 100644
--- a/gpu4pyscf/lib/gdft/nr_eval_gto.cuh
+++ b/gpu4pyscf/lib/gdft/nr_eval_gto.cuh
@@ -29,8 +29,12 @@ typedef struct {

 typedef struct {
     int ngrids;
+    int nbas;
+    int nao;
     int bas_off;
     int nprim;
+    int *ao_loc;
+    int *bas_indices;
     double fac;
     double *gridx;
     double *data;
diff --git a/gpu4pyscf/lib/gdft/vv10.cu b/gpu4pyscf/lib/gdft/vv10.cu
index df26dffe..620ae2e9 100644
--- a/gpu4pyscf/lib/gdft/vv10.cu
+++ b/gpu4pyscf/lib/gdft/vv10.cu
@@ -25,7 +25,7 @@
 #include "nr_eval_gto.cuh"
 #include "contract_rho.cuh"

-#define THREADS 256
+#define NG_PER_BLOCK 128
 #define NG_PER_THREADS 1

 __global__
@@ -46,6 +46,7 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec,
         W0i = W0[grid_id];
         Ki = K[grid_id];
     }
+
     double F = 0.0;
     double U = 0.0;
     double W = 0.0;
@@ -54,42 +55,39 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec,
     double *yj = vvcoords + vvngrids;
     double *zj = vvcoords + 2*vvngrids;

-    __shared__ double3 xj_t[THREADS];
-    __shared__ double3 kp_t[THREADS];
+    __shared__ double3 xj_t[NG_PER_BLOCK];
+    __shared__ double3 kp_t[NG_PER_BLOCK];
     const int tx = threadIdx.x;
+
     for (int j = 0; j < vvngrids; j+=blockDim.x) {
-        int idx = j + threadIdx.x;
+        int idx = j + tx;
         xj_t[tx] = {xj[idx], yj[idx], zj[idx]};
         kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]};
         __syncthreads();
-        for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){
+
+        for (int l = 0, M = min(NG_PER_BLOCK, vvngrids - j); l < M; ++l){
             double3 xj_tmp = xj_t[l];
-            double pjx = xj_tmp.x;
-            double pjy = xj_tmp.y;
-            double pjz = xj_tmp.z;

             // about 23 operations for each pair
-            double DX = pjx - xi;
-            double DY = pjy - yi;
-            double DZ = pjz - zi;
+            double DX = xj_tmp.x - xi;
+            double DY = xj_tmp.y - yi;
+            double DZ = xj_tmp.z - zi;
             double R2 = DX*DX + DY*DY + DZ*DZ;

-            double3 kp_tmp = kp_t[l];
-            double Kpj = kp_tmp.x;
-            double W0pj = kp_tmp.y;
-            double RpWj = kp_tmp.z;
+            double3 kp_tmp = kp_t[l];   // (Kpj, W0pj, RpWj)

-            double gp = R2*W0pj + Kpj;
+            double gp = R2*kp_tmp.y + kp_tmp.x;
             double g = R2*W0i + Ki;
             double gt = g + gp;
             double ggt = g * gt;
-            double T = RpWj / (gp*ggt);
+            double g_gt = (g + gt)/ggt;
+            double T = kp_tmp.z / (gp*ggt);
             F += T;
-            T *= (g + gt)/ggt;
+            T *= g_gt;
             U += T;
             W += T * R2;
         }
@@ -100,6 +98,7 @@ static void vv10_kernel(double *Fvec, double *Uvec, double *Wvec,
         Uvec[grid_id] = U;
         Wvec[grid_id] = W;
     }
+
 }

 __global__
@@ -127,8 +126,8 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords,
     double *yj = vvcoords + vvngrids;
     double *zj = vvcoords + 2*vvngrids;

-    __shared__ double3 xj_t[THREADS];
-    __shared__ double3 kp_t[THREADS];
+    __shared__ double3 xj_t[NG_PER_BLOCK];
+    __shared__ double3 kp_t[NG_PER_BLOCK];
     const int tx =
threadIdx.x; for (int j = 0; j < vvngrids; j+=blockDim.x) { @@ -138,7 +137,7 @@ static void vv10_grad_kernel(double *Fvec, double *vvcoords, double *coords, kp_t[tx] = {Kp[idx], W0p[idx], RpW[idx]}; __syncthreads(); - for (int l = 0, M = min(THREADS, vvngrids - j); l < M; ++l){ + for (int l = 0, M = min(NG_PER_BLOCK, vvngrids - j); l < M; ++l){ double3 xj_tmp = xj_t[l]; double pjx = xj_tmp.x; double pjy = xj_tmp.y; @@ -181,8 +180,8 @@ int VXC_vv10nlc(cudaStream_t stream, double *Fvec, double *Uvec, double *Wvec, double *W0p, double *W0, double *K, double *Kp, double *RpW, int vvngrids, int ngrids) { - dim3 threads(THREADS); - dim3 blocks((ngrids/NG_PER_THREADS+1+THREADS-1)/THREADS); + dim3 threads(NG_PER_BLOCK); + dim3 blocks((ngrids/NG_PER_THREADS+1+NG_PER_BLOCK-1)/NG_PER_BLOCK); vv10_kernel<<>>(Fvec, Uvec, Wvec, vvcoords, coords, W0p, W0, K, Kp, RpW, vvngrids, ngrids); @@ -199,8 +198,8 @@ int VXC_vv10nlc_grad(cudaStream_t stream, double *Fvec, double *vvcoords, double double *W0p, double *W0, double *K, double *Kp, double *RpW, int vvngrids, int ngrids) { - dim3 threads(THREADS); - dim3 blocks((ngrids+THREADS-1)/THREADS); + dim3 threads(NG_PER_BLOCK); + dim3 blocks((ngrids+NG_PER_BLOCK-1)/NG_PER_BLOCK); vv10_grad_kernel<<>>(Fvec, vvcoords, coords, W0p, W0, K, Kp, RpW, vvngrids, ngrids); cudaError_t err = cudaGetLastError(); diff --git a/gpu4pyscf/lib/gint/gout3c2e.cu b/gpu4pyscf/lib/gint/gout3c2e.cu index 863cf844..fb4ff75e 100644 --- a/gpu4pyscf/lib/gint/gout3c2e.cu +++ b/gpu4pyscf/lib/gint/gout3c2e.cu @@ -13,7 +13,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - + #pragma once #include @@ -32,14 +32,14 @@ static void GINTgout3c2e_ip(GINTEnvVars envs, double* __restrict__ gout, double* int16_t *idy = idx + nf; int16_t *idz = idx + nf * 2; int i, n, ix, iy, iz; - + for (i = 0; i < nf; i++) { - ix = idx[i]; - iy = idy[i]; + ix = idx[i]; + iy = idy[i]; iz = idz[i]; - double sx = gout[3*i + 0]; - double sy = gout[3*i + 1]; + double sx = gout[3*i + 0]; + double sy = gout[3*i + 1]; double sz = gout[3*i + 2]; #pragma unroll for (n = 0; n < NROOTS; ++n) { @@ -64,21 +64,27 @@ static void GINTgout3c2e_ip(GINTEnvVars envs, double* __restrict__ gout, double* int i, n, ix, iy, iz; for (i = 0; i < nf; i++) { - ix = idx[i]; iy = idy[i]; iz = idz[i]; - sx = gout[3*i + 0]; sy = gout[3*i + 1]; sz = gout[3*i + 2]; + ix = idx[i]; + iy = idy[i]; + iz = idz[i]; + sx = gout[3*i + 0]; + sy = gout[3*i + 1]; + sz = gout[3*i + 2]; #pragma unroll for (n = 0; n < NROOTS; ++n) { sx += f[ix+n] * g[iy+n] * g[iz+n]; sy += g[ix+n] * f[iy+n] * g[iz+n]; sz += g[ix+n] * g[iy+n] * f[iz+n]; } - gout[3*i + 0] = sx; gout[3*i + 1] = sy; gout[3*i + 2] = sz; + gout[3*i + 0] = sx; + gout[3*i + 1] = sy; + gout[3*i + 2] = sz; } } } template __device__ -static void GINTgout3c2e_ipip(GINTEnvVars envs, double* __restrict__ gout, double* __restrict__ g0, double* __restrict__ g1, double* __restrict__ g2, +static void GINTgout3c2e_ipip(GINTEnvVars envs, double* __restrict__ gout, double* __restrict__ g0, double* __restrict__ g1, double* __restrict__ g2, double* __restrict__ g3) { int nf = envs.nf; @@ -86,19 +92,19 @@ double* __restrict__ g3) int16_t *idy = idx + nf; int16_t *idz = idx + nf * 2; int i, n, ix, iy, iz; - + for (i = 0; i < nf; i++) { - ix = idx[i]; - iy = idy[i]; + ix = idx[i]; + iy = idy[i]; iz = idz[i]; - double sxx = gout[9*i + 0]; + double sxx = gout[9*i + 0]; double sxy = gout[9*i + 1]; double sxz = gout[9*i + 2]; - double syx = gout[9*i + 3]; + double 
syx = gout[9*i + 3]; double syy = gout[9*i + 4]; double syz = gout[9*i + 5]; - double szx = gout[9*i + 6]; + double szx = gout[9*i + 6]; double szy = gout[9*i + 7]; double szz = gout[9*i + 8]; #pragma unroll @@ -135,7 +141,7 @@ static void GINTgout3c2e(GINTEnvVars envs, double* __restrict__ gout, double* __ int16_t *idz = idx + nf * 2; double s; int i, n, ix, iy, iz; - + for (i = 0; i < nf; i++) { ix = idx[i]; iy = idy[i]; @@ -157,7 +163,7 @@ static void GINTgout3c2e(GINTEnvVars envs, double* __restrict__ gout, double* __ int16_t *idz = idx + nf * 2; double s; int i, n, ix, iy, iz; - + for (i = 0; i < nf; i++) { ix = idx[i]; iy = idy[i]; @@ -201,7 +207,7 @@ static void GINTwrite_int3c2e_ipip_direct(GINTEnvVars envs, ERITensor eri, doubl int16_t *idy = idx + nf; int16_t *idz = idx + nf * 2; int ix, iy, iz, off; - + for (n = 0, k = k0; k < k1; ++k) { pxx_eri = eri.data + 0 * lstride + k * kstride; pxy_eri = eri.data + 1 * lstride + k * kstride; @@ -216,9 +222,9 @@ static void GINTwrite_int3c2e_ipip_direct(GINTEnvVars envs, ERITensor eri, doubl for (j = j0; j < j1; ++j) { for (i = i0; i < i1; ++i, ++n) { ix = idx[n]; - iy = idy[n]; + iy = idy[n]; iz = idz[n]; - + double eri_xx = 0; double eri_xy = 0; double eri_xz = 0; @@ -278,7 +284,7 @@ static void GINTwrite_int3c2e_ip_direct(GINTEnvVars envs, ERITensor eri, double* int16_t *idy = idx + nf; int16_t *idz = idx + nf * 2; int ix, iy, iz, off; - + for (n = 0, k = k0; k < k1; ++k) { px_eri = eri.data + 0 * lstride + k * kstride; py_eri = eri.data + 1 * lstride + k * kstride; @@ -287,9 +293,9 @@ static void GINTwrite_int3c2e_ip_direct(GINTEnvVars envs, ERITensor eri, double* for (j = j0; j < j1; ++j) { for (i = i0; i < i1; ++i, ++n) { ix = idx[n]; - iy = idy[n]; + iy = idy[n]; iz = idz[n]; - + double eri_x = 0; double eri_y = 0; double eri_z = 0; @@ -329,16 +335,16 @@ static void GINTwrite_int3c2e_direct(GINTEnvVars envs, ERITensor eri, double* g, int16_t *idy = idx + nf; int16_t *idz = idx + nf * 2; int ix, iy, iz, off; - + for (n = 0, k = k0; k < k1; ++k) { p_eri = eri.data + 0 * lstride + k * kstride; for (j = j0; j < j1; ++j) { for (i = i0; i < i1; ++i, ++n) { ix = idx[n]; - iy = idy[n]; + iy = idy[n]; iz = idz[n]; - + double eri = 0; #pragma unroll for (int ir = 0; ir < NROOTS; ++ir){ @@ -399,7 +405,7 @@ static void GINTwrite_int3c2e_ip(ERITensor eri, double* __restrict__ gout, double* __restrict__ px_eri; double* __restrict__ py_eri; double* __restrict__ pz_eri; - + for (n = 0, k = k0; k < k1; ++k) { px_eri = eri.data + 0 * lstride + k * kstride; py_eri = eri.data + 1 * lstride + k * kstride; @@ -410,7 +416,7 @@ static void GINTwrite_int3c2e_ip(ERITensor eri, double* __restrict__ gout, sx = gout[3 * n]; sy = gout[3 * n + 1]; sz = gout[3 * n + 2]; - + off = i + jstride * j; px_eri[off] = sx; py_eri[off] = sy;
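For each grid point i, the vv10_kernel patched above accumulates three sums over the vv grid j: with g = R2*W0_i + K_i, gp = R2*W0p_j + Kp_j, and gt = g + gp, it adds T = RpW_j / (gp*g*gt) to F, then T*(g + gt)/(g*gt) to U, and that same term times R2 to W, streaming j through NG_PER_BLOCK-sized shared-memory tiles. A dense NumPy reference for the same sums — the function name and the (N, 3)/(M, 3) coordinate layout are illustrative (the kernel stores x/y/z in separate strided arrays), and the dense N x M intermediates make this a correctness check, not performance code:

    import numpy as np

    def vv10_fuw(coords, W0, K, vvcoords, W0p, Kp, RpW):
        # coords: (N, 3), vvcoords: (M, 3); W0, K: (N,); W0p, Kp, RpW: (M,)
        d = coords[:, None, :] - vvcoords[None, :, :]
        R2 = np.einsum('ijx,ijx->ij', d, d)        # squared pair distances
        g = R2 * W0[:, None] + K[:, None]
        gp = R2 * W0p[None, :] + Kp[None, :]
        gt = g + gp
        T = RpW[None, :] / (gp * g * gt)
        F = T.sum(axis=1)                          # energy-like sum
        T = T * (g + gt) / (g * gt)
        U = T.sum(axis=1)                          # potential term
        W = (T * R2).sum(axis=1)                   # gradient-weighted term
        return F, U, W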