Skip to content

Commit

Permalink
optimized initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
wxj6000 committed Jan 12, 2024
1 parent f4e831d commit 86d449d
Show file tree
Hide file tree
Showing 12 changed files with 76 additions and 124 deletions.
35 changes: 0 additions & 35 deletions examples/18-to_gpu0.py

This file was deleted.

41 changes: 0 additions & 41 deletions examples/19-pcm_optimize.py

This file was deleted.

4 changes: 2 additions & 2 deletions examples/dft_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@
basis=bas,
max_memory=32000)
# set verbose >= 6 for debugging timer
mol.verbose = 7
mol.verbose = 4

mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis)
mf_df.verbose = 7
mf_df.verbose = 4

if args.solvent:
mf_df = mf_df.PCM()
Expand Down
19 changes: 10 additions & 9 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def loop(self, blksize=None, unpack=True):
rows = self.intopt.cderi_row
cols = self.intopt.cderi_col
buf_prefetch = None

buf_cderi = cupy.zeros([blksize,nao,nao])
data_stream = cupy.cuda.stream.Stream(non_blocking=True)
compute_stream = cupy.cuda.get_current_stream()
#compute_stream = cupy.cuda.stream.Stream()
Expand All @@ -153,14 +153,15 @@ def loop(self, blksize=None, unpack=True):
buf_prefetch.set(cderi_sparse[p1:p2,:])
stop_event = data_stream.record()
if unpack:
buf2 = cupy.zeros([p1-p0,nao,nao])
buf2[:p1-p0,rows,cols] = buf
buf2[:p1-p0,cols,rows] = buf
buf_cderi[:p1-p0,rows,cols] = buf
buf_cderi[:p1-p0,cols,rows] = buf
buf2 = buf_cderi[:p1-p0]
else:
buf2 = None
yield buf2, buf.T
compute_stream.wait_event(stop_event)
cupy.cuda.Device().synchronize()
if isinstance(cderi_sparse, np.ndarray):
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch
Expand Down Expand Up @@ -205,8 +206,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
except Exception:
raise RuntimeError('Out of CPU memory')

data_stream = cupy.cuda.stream.Stream(non_blocking=False)
if(not use_gpu_memory):
data_stream = cupy.cuda.stream.Stream(non_blocking=False)
count = 0
nq = len(intopt.log_qs)
for cp_ij_id, _ in enumerate(intopt.log_qs):
Expand Down Expand Up @@ -265,8 +266,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
for i in range(naux):
cderi_block[i].get(out=cderi[i,ij0:ij1])
t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1)

cupy.cuda.Device().synchronize()
if not use_gpu_memory:
cupy.cuda.Device().synchronize()
return cderi


1 change: 0 additions & 1 deletion gpu4pyscf/df/df_jk.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def build_df():
rks.initialize_grids(mf, mf.mol, dm0)
ni.build(mf.mol, mf.grids.coords)
mf._numint.xcfuns = numint._init_xcfuns(mf.xc, dm0.ndim==3)
dm0 = cupy.asarray(dm0)
return

def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False):
Expand Down
40 changes: 23 additions & 17 deletions gpu4pyscf/dft/numint.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64)
ALIGNED = getattr(__config__, 'grid_aligned', 16*16)
AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16)
AO_THRESHOLD = 1e-12
AO_THRESHOLD = 1e-10

# Should we release the cupy cache?
FREE_CUPY_CACHE = False
Expand Down Expand Up @@ -199,14 +199,15 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
rho = _contract_rho(c0, c0)
elif xctype in ('GGA', 'NLC'):
rho = cupy.empty((4,ngrids))
c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc)
#c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc)
c0 = contract('nig,io->nog', ao, cpos)
#:rho[0] = numpy.einsum('pi,pi->p', c0, c0)
_contract_rho(c0, c0, rho=rho[0])
_contract_rho(c0[0], c0[0], rho=rho[0])
for i in range(1, 4):
c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc)
#c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc)
#:rho[i] = numpy.einsum('pi,pi->p', c0, c1) * 2 # *2 for +c.c.
_contract_rho(c0, c1, rho=rho[i])
rho[i] *= 2
_contract_rho(c0[0], c0[i], rho=rho[i])
rho[1:] *= 2
else: # meta-GGA
if with_lapl:
# rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2
Expand All @@ -215,29 +216,31 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
else:
rho = cupy.empty((5,ngrids))
tau_idx = 4
c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc)
#c0 = _dot_ao_dm(mol, ao[0], cpos, non0tab, shls_slice, ao_loc)
c0 = contract('nig,io->nog', ao, cpos)
#:rho[0] = numpy.einsum('pi,pi->p', c0, c0)
_contract_rho(c0, c0, rho=rho[0])
_contract_rho(c0[0], c0[0], rho=rho[0])

rho[tau_idx] = 0
for i in range(1, 4):
c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc)
#c1 = _dot_ao_dm(mol, ao[i], cpos, non0tab, shls_slice, ao_loc)
#:rho[i] = numpy.einsum('pi,pi->p', c0, c1) * 2 # *2 for +c.c.
#:rho[5] += numpy.einsum('pi,pi->p', c1, c1)
rho[i] = _contract_rho(c0, c1) * 2
rho[tau_idx] += _contract_rho(c1, c1)
rho[i] = _contract_rho(c0[0], c0[i])
rho[tau_idx] += _contract_rho(c0[i], c0[i])

if with_lapl:
if ao.shape[0] > 4:
XX, YY, ZZ = 4, 7, 9
ao2 = ao[XX] + ao[YY] + ao[ZZ]
c1 = _dot_ao_dm(mol, ao2, cpos, non0tab, shls_slice, ao_loc)
#:rho[4] = numpy.einsum('pi,pi->p', c0, c1)
rho[4] = _contract_rho(c0, c1)
rho[4] = _contract_rho(c0[0], c1)
rho[4] += rho[5]
rho[4] *= 2
else:
rho[4] = 0
rho[1:4] *= 2
rho[tau_idx] *= .5
return rho

Expand Down Expand Up @@ -321,11 +324,14 @@ def eval_rho4(mol, ao, c0, mo1, non0tab=None, xctype='LDA',
rho[i] = _contract_rho(c0, c_0[i])
rho *= 2.0
elif xctype in ('GGA', 'NLC'):
log = logger.new_logger(mol, mol.verbose)
t0 = log.init_timer()
c_0 = contract('nig,aio->anog', ao, cpos1)
t0 = log.timer_debug1('ao * cpos', *t0)
rho = cupy.empty([na, 4, ngrids])
for i in range(na):
_contract_rho_gga(c0, c_0[i], rho=rho[i])

t0 = log.timer_debug1('contract rho', *t0)
else: # meta-GGA
if with_lapl:
raise NotImplementedError("mGGA with lapl not implemented")
Expand Down Expand Up @@ -454,7 +460,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
nset = len(dms)

if mo_coeff is not None:
mo_coeff = coeff @ mo_coeff
mo_coeff = mo_coeff[opt.ao_idx]

nelec = cupy.zeros(nset)
excsum = cupy.zeros(nset)
Expand Down Expand Up @@ -494,7 +500,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
mo_coeff_mask = mo_coeff[idx,:]
rho_tot[i,:,p0:p1] = eval_rho2(mol, ao_mask, mo_coeff_mask, mo_occ, None, xctype)
p0 = p1
t1 = log.timer_debug2('eval rho', *t1)
t1 = log.timer_debug2('eval rho slice', *t1)
t0 = log.timer_debug1('eval rho', *t0)

wv = []
Expand Down Expand Up @@ -755,7 +761,7 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=
c0 = contract('nig,io->nog', ao, occ_coeff_mask)
else: # mgga
c0 = contract('nig,io->nog', ao, occ_coeff_mask)

t1 = log.timer_debug2(f'eval occ_coeff, with mocc: {with_mocc}', *t1)
if with_mocc:
rho1 = eval_rho4(opt.mol, ao, c0, mo1[:,mask], xctype=xctype, with_lapl=False)
else:
Expand Down Expand Up @@ -1242,7 +1248,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
for ip0, ip1 in lib.prange(0, ngrids, blksize):
coords = grids.coords[ip0:ip1]
weight = grids.weights[ip0:ip1]

t1 = log.init_timer()
# cache ao indices
if (block_id, blksize, ngrids) not in ni.non0ao_idx:
stream = cupy.cuda.get_current_stream()
Expand Down
16 changes: 10 additions & 6 deletions gpu4pyscf/grad/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,10 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
vmat_tmp = _gga_grad_sum_(ao_mask, wv)
add_sparse(vmat, vmat_tmp, mask)

vmat = contract('npq,qj->npj', vmat, coeff)
vmat = contract('pi,npj->nij', coeff, vmat)
#vmat = contract('npq,qj->npj', vmat, coeff)
#vmat = contract('pi,npj->nij', coeff, vmat)
rev_ao_idx = opt.rev_ao_idx
vmat = take_last2d(vmat, rev_ao_idx)
exc = None
# - sign because nabla_X = -nabla_x
return exc, -vmat
Expand Down Expand Up @@ -278,10 +280,12 @@ def _make_dR_dao_w(ao, wv):

def _d1_dot_(ao1, ao2, out=None):
if out is None:
vmat0 = cupy.dot(ao1[0], ao2)
vmat1 = cupy.dot(ao1[1], ao2)
vmat2 = cupy.dot(ao1[2], ao2)
return cupy.stack([vmat0,vmat1,vmat2])
out = cupy.empty([3, ao1[0].shape[0], ao2.shape[1]])
out[0] = cupy.dot(ao1[0], ao2)
out[1] = cupy.dot(ao1[1], ao2)
out[2] = cupy.dot(ao1[2], ao2)
return out
#return cupy.stack([vmat0,vmat1,vmat2])
else:
cupy.dot(ao1[0], ao2, out=out[0])
cupy.dot(ao1[1], ao2, out=out[1])
Expand Down
8 changes: 6 additions & 2 deletions gpu4pyscf/hessian/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,13 @@ def _d1d2_dot_(vmat, mol, ao1, ao2, mask, ao_loc, dR1_on_bra=True):
for d2 in range(3):
vmat[d1,d2] += numint._dot_ao_ao(mol, ao1[d1], ao2[d2], mask,
shls_slice, ao_loc)
#vmat += contract('xig,yjg->xyij', ao1, ao2)
else: # (d/dR2 bra) * (d/dR1 ket)
for d1 in range(3):
for d2 in range(3):
vmat[d1,d2] += numint._dot_ao_ao(mol, ao1[d2], ao2[d1], mask,
shls_slice, ao_loc)

#vmat += contract('yig,xjg->xyij', ao1, ao2)
def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
mol = hessobj.mol
mf = hessobj.base
Expand Down Expand Up @@ -461,7 +462,10 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory):
for i in range(3):
aow = rks_grad._make_dR_dao_w(ao_mask, wv[i])
rks_grad._d1_dot_(aow, ao_mask[0].T, out=vmat_tmp[i])
aow = [numint._scale_ao(ao_mask[:4], wv[i,:4]) for i in range(3)]
ng = len(weight)
aow = cupy.empty([3,nao_non0,ng])
for i in range(3):
aow[i] = numint._scale_ao(ao_mask[:4], wv[i,:4])
_d1d2_dot_(vmat_tmp, mol, ao_mask[1:4], aow, mask, ao_loc, False)
#vmat_tmp = contract('pi,xypq->xyiq', coeff[mask], vmat_tmp)
#vmat_tmp = contract('qj,xyiq->xyij', coeff[mask], vmat_tmp)
Expand Down
12 changes: 7 additions & 5 deletions gpu4pyscf/lib/gdft/nr_eval_gto.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#include "nr_eval_gto.cuh"
#include "contract_rho.cuh"

#define NG_PER_BLOCK 128
#define NG_PER_BLOCK 256
#define LMAX 8
#define GTO_MAX_CART 15

Expand Down Expand Up @@ -354,9 +354,10 @@ static void _cart_kernel_deriv1(BasOffsets offsets)
double ce_2a = 0;
for (int ip = 0; ip < offsets.nprim; ++ip) {
double c = coeffs[ip];
double e = exp(-exps[ip] * rr);
double exp_ip = exps[ip];
double e = exp(-exp_ip * rr);
ce += c * e;
ce_2a += c * e * exps[ip];
ce_2a += c * e * exp_ip;
}
ce *= offsets.fac;
ce_2a *= -2 * offsets.fac;
Expand Down Expand Up @@ -1025,9 +1026,10 @@ static void _sph_kernel_deriv1(BasOffsets offsets)
double ce_2a = 0;
for (int ip = 0; ip < offsets.nprim; ++ip) {
double c = coeffs[ip];
double e = exp(-exps[ip] * rr);
double exp_ip = exps[ip];
double e = exp(-exp_ip * rr);
ce += c * e;
ce_2a += c * e * exps[ip];
ce_2a += c * e * exp_ip;
}
ce *= offsets.fac;
ce_2a *= -2 * offsets.fac;
Expand Down
Loading

0 comments on commit 86d449d

Please sign in to comment.