From 761f85ff22cf04d62167451dbf82df48caefb2e8 Mon Sep 17 00:00:00 2001 From: Xiaojie Wu Date: Sat, 25 May 2024 09:32:23 -0700 Subject: [PATCH] Improve Krylov subspace solver in CPHF (#156) * improve df scheme * separate krylov subspace in cphf * sg1_prune * block krylov subspace * debug -> info * consistent linear dependence thresholds for df * df inv -> solve * df inv -> solve in uhf * bugfix * bugfix * dynamic slicing in df hessian * reduce the computational cost in df Hessian * bugfix * raise error if CPHF does not converge * vectorize krylov subspace * conv_tol_cpscf = 1e-6 * ucphf * unit test --- examples/00-h2o.py | 2 +- examples/dft_driver.py | 4 +- gpu4pyscf/df/df.py | 6 +- gpu4pyscf/df/grad/rhf.py | 26 +++++- gpu4pyscf/df/hessian/rhf.py | 86 +++++++++++--------- gpu4pyscf/df/hessian/uhf.py | 101 +++++++++++++----------- gpu4pyscf/df/int3c2e.py | 16 ++-- gpu4pyscf/dft/gen_grid.py | 7 +- gpu4pyscf/hessian/rhf.py | 12 +-- gpu4pyscf/hessian/uhf.py | 6 +- gpu4pyscf/lib/cupy_helper.py | 49 +++++++----- gpu4pyscf/lib/tests/test_cupy_helper.py | 14 +++- gpu4pyscf/scf/cphf.py | 28 ++++--- gpu4pyscf/scf/hf.py | 2 +- gpu4pyscf/scf/tests/test_cphf.py | 12 +-- gpu4pyscf/scf/ucphf.py | 14 ++-- 16 files changed, 229 insertions(+), 156 deletions(-) diff --git a/examples/00-h2o.py b/examples/00-h2o.py index acaa740b..2bf6c993 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -36,7 +36,7 @@ atom=atom, # water molecule basis='def2-tzvpp', # basis set output='./pyscf.log', # save log file - verbose=1 # control the level of print info + verbose=6 # control the level of print info ) mf_GPU = rks.RKS( # restricted Kohn-Sham DFT diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 0a6bd911..50948d2e 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -36,13 +36,13 @@ basis=bas, max_memory=32000) # set verbose >= 6 for debugging timer -mol.verbose = 4 +mol.verbose = 6 if args.unrestricted: mf_df = uks.UKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) else: mf_df = rks.RKS(mol, xc=args.xc).density_fit(auxbasis=args.auxbasis) -mf_df.verbose = 4 +mf_df.verbose = 6 if args.solvent: mf_df = mf_df.PCM() diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index e8198ccf..206e7902 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -30,7 +30,9 @@ MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 32) -LINEAR_DEP_TOL = incore.LINEAR_DEP_THR + +# TODO: reuse the setting in pyscf 2.6 +LINEAR_DEP_THR = 1e-6#incore.LINEAR_DEP_THR class DF(lib.StreamObject): from gpu4pyscf.lib.utils import to_gpu, device @@ -93,7 +95,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): self.cd_low = tag_array(self.cd_low, tag='cd') except Exception: w, v = cupy.linalg.eigh(j2c) - idx = w > LINEAR_DEP_TOL + idx = w > LINEAR_DEP_THR self.cd_low = (v[:,idx] / cupy.sqrt(w[idx])) self.cd_low = tag_array(self.cd_low, tag='eig') diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 1a1dfb48..ad8721a0 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -18,17 +18,39 @@ import cupy from cupyx.scipy.linalg import solve_triangular from pyscf import scf -from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import print_mem_info, tag_array, unpack_tril, contract, load_library, take_last2d +from gpu4pyscf.df import int3c2e, df +from gpu4pyscf.lib.cupy_helper import (print_mem_info, tag_array, +unpack_tril, contract, load_library, take_last2d, cholesky) from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf import __config__ from gpu4pyscf.lib import logger libcupy_helper = load_library('libcupy_helper') +LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 64) +def _gen_metric_solver(int2c, decompose_j2c='CD', lindep=LINEAR_DEP_THRESHOLD): + ''' generate a solver to solve Ax = b, RHS must be in (n,....) ''' + if decompose_j2c.upper() == 'CD': + try: + j2c = cholesky(int2c, lower=True) + def j2c_solver(v): + return solve_triangular(j2c, v, overwrite_b=False) + return j2c_solver + + except Exception: + pass + + w, v = cupy.linalg.eigh(int2c) + mask = w > lindep + v1 = v[:,mask] + j2c = cupy.dot(v1/w[mask], v1.conj().T) + def j2c_solver(b): # noqa: F811 + return j2c.dot(b.reshape(j2c.shape[0],-1)).reshape(b.shape) + return j2c_solver + def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega=None): if mol is None: mol = mf_grad.mol #TODO: dm has to be the SCF density matrix in this version. dm should be diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index e31d006d..1daae751 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -32,20 +32,23 @@ ''' - import numpy import cupy import numpy as np -from pyscf import lib, df -from pyscf.df.grad.rhf import LINEAR_DEP_THRESHOLD +from pyscf import lib +from pyscf.df.incore import LINEAR_DEP_THR from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import ( - contract, tag_array, release_gpu_stack, print_mem_info, take_last2d, pinv) -from gpu4pyscf.df import int3c2e + contract, tag_array, get_avail_mem, release_gpu_stack, print_mem_info, take_last2d, pinv) +from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib import logger +from gpu4pyscf import __config__ +from gpu4pyscf.df.grad.rhf import _gen_metric_solver +LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR BLKSIZE = 256 +ALIGNED = getattr(__config__, 'ao_aligned', 32) def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -101,6 +104,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c = cupy.asarray(int2c, order='C') int2c = take_last2d(int2c, aux_ao_idx) int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THRESHOLD) + solve_j2c = _gen_metric_solver(int2c) int2c = None int2c_ip1 = cupy.asarray(int2c_ip1, order='C') @@ -114,8 +118,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) - rhoj0_P = contract('pq,q->p', int2c_inv, wj) - rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__) + rhoj0_P = solve_j2c(wj) + rhok0_P__ = solve_j2c(wk_P__) wj = wk_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) @@ -125,56 +129,64 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) - rhoj1_P = contract('pq,ipx->iqx', int2c_inv, wj1_P) + #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) + rhoj1_P = solve_j2c(wj1_P) - hj_ao_ao += 4.0*contract('ipx,jpy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None if hessobj.auxbasis_response: wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,ipx->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('ipx,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('ipx,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) wj1_01 = None rhoj1_P = None - int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) if with_k: - for i0, i1 in lib.prange(0,nao,64): - wk1_Pko_islice = cupy.asarray(wk1_Pko[i0:i1]) - rhok1_Pko = contract('pq,iqox->ipox', int2c_inv, wk1_Pko_islice) - for k0, k1 in lib.prange(0,nao,64): - wk1_Pko_kslice = cupy.asarray(wk1_Pko[k0:k1]) + mem_avail = get_avail_mem() + nocc = mocc.shape[1] + slice_size = naux*nocc*9 # largest slice of intermediate variables + blksize = int(mem_avail*0.2/8/slice_size/ALIGNED) * ALIGNED + for i0, i1 in lib.prange(0,nao,blksize): + wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) + #rhok1_Pko = contract('pq,qiox->piox', int2c_inv, wk1_Pko_islice) + rhok1_Pko = solve_j2c(wk1_Pko_islice) + wk1_Pko_islice = None + for k0, k1 in lib.prange(0,nao,blksize): + wk1_Pko_kslice = cupy.asarray(wk1_Pko[:,k0:k1]) # (10|0)(0|10) without response of RI basis - vk2_ip1_ip1 = contract('ipox,kpoy->ikxy', rhok1_Pko, wk1_Pko_kslice) + vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko, wk1_Pko_kslice) hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0[i0:i1,k0:k1]) vk2_ip1_ip1 = None # (10|0)(0|01) without response of RI basis - bra = contract('ipox,ko->ipkx', rhok1_Pko, mocc_2[k0:k1]) - ket = contract('kpoy,io->kpiy', wk1_Pko_kslice, mocc_2[i0:i1]) - hk_ao_ao[i0:i1,k0:k1] += contract('ipkx,kpiy->ikxy', bra, ket) + bra = contract('piox,ko->pikx', rhok1_Pko, mocc_2[k0:k1]) + ket = contract('pkoy,io->pkiy', wk1_Pko_kslice, mocc_2[i0:i1]) + hk_ao_ao[i0:i1,k0:k1] += contract('pikx,pkiy->ikxy', bra, ket) bra = ket = None wk1_Pko_kslice = None if hessobj.auxbasis_response: # (10|0)(1|00) - wk_ip2_Ipo = contract('porx,io->iprx', wk_ip2_P__, mocc_2[i0:i1]) - hk_ao_aux[i0:i1] += contract('ipox,ipoy->ipxy', rhok1_Pko, wk_ip2_Ipo) + wk_ip2_Ipo = contract('porx,io->pirx', wk_ip2_P__, mocc_2[i0:i1]) + hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1_Pko, wk_ip2_Ipo) wk_ip2_Ipo = None # (10|0)(1|0)(0|00) rhok0_P_I = contract('qor,ir->qoi', rhok0_P__, mocc_2[i0:i1]) - wk1_P_I = contract('ypq,qoi->ipoy', int2c_ip1, rhok0_P_I) - hk_ao_aux[i0:i1] -= contract("ipox,ipoy->ipxy", rhok1_Pko, wk1_P_I) - wk1_P_I = rhok1_Pko = None + wk1_P_I = contract('ypq,qoi->pioy', int2c_ip1, rhok0_P_I) + hk_ao_aux[i0:i1] -= contract("piox,pioy->ipxy", rhok1_Pko, wk1_P_I) + wk1_P_I = None # (10|0)(0|1)(0|00) - for q0,q1 in lib.prange(0,naux,64): - wk1_I = contract('yqp,ipox->iqoxy', int2c_ip1_inv[:,q0:q1], wk1_Pko_islice) - hk_ao_aux[i0:i1,q0:q1] -= contract('qoi,iqoxy->iqxy', rhok0_P_I[q0:q1], wk1_I) + #for q0,q1 in lib.prange(0,naux,64): + wk1_I = contract('yqp,piox->qioxy', int2c_ip1, rhok1_Pko) + hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I) + #wk1_I = contract('piox,qoi->ipqx', rhok1_Pko, rhok0_P_I) + #hk_ao_aux[i0:i1] -= contract('ipqx,yqp->iqxy', wk1_I, int2c_ip1) wk1_I = rhok0_P_I = None - wk1_Pko_islice = None + wk1_Pko = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) @@ -249,6 +261,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # aux-aux pair if hessobj.auxbasis_response > 1: wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) @@ -448,21 +461,22 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, dm0_tag = tag_array(dm0, occ_coeff=mocc) int2c = take_last2d(int2c, aux_ao_idx) - int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THRESHOLD) + solve_j2c = _gen_metric_solver(int2c) int2c = None wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = contract('pq,q->p', int2c_inv, wj) + rhoj0 = solve_j2c(wj) + wj = None if isinstance(wk_Pl_, cupy.ndarray): - rhok0_Pl_ = contract('pq,qio->pio', int2c_inv, wk_Pl_) + rhok0_Pl_ = solve_j2c(wk_Pl_) else: rhok0_Pl_ = np.empty_like(wk_Pl_) for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) - rhok0_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() + rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() wk_tmp = None - wk_Pl_ = int2c_inv = None + wk_Pl_ = None # ----------------------------- # int3c_ip1 contributions diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index f20d2092..46dca4ea 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -36,14 +36,17 @@ import numpy import cupy import numpy as np -from pyscf import lib, df +from pyscf import lib from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import rhf as rhf_hess -from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack, print_mem_info, take_last2d -from gpu4pyscf.df import int3c2e +from gpu4pyscf.lib.cupy_helper import ( + contract, tag_array, release_gpu_stack, print_mem_info, take_last2d, pinv) +from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib import logger +from gpu4pyscf.df.grad.rhf import _gen_metric_solver +LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR BLKSIZE = 256 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -105,7 +108,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, dm0b_tag = tag_array(dm0b, occ_coeff=moccb) int2c = cupy.asarray(int2c, order='C') int2c = take_last2d(int2c, aux_ao_idx) - int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) + int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THRESHOLD) + solve_j2c = _gen_metric_solver(int2c) int2c = None int2c_ip1 = cupy.asarray(int2c_ip1, order='C') @@ -120,9 +124,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # int3c contributions wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega) wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega) - rhoj0_P = contract('pq,q->p', int2c_inv, wja + wjb) - rhok0a_P__ = contract('pq,qij->pij', int2c_inv, wka_P__) - rhok0b_P__ = contract('pq,qij->pij', int2c_inv, wkb_P__) + rhoj0_P = solve_j2c(wja + wjb) + rhok0a_P__ = solve_j2c(wka_P__) + rhok0b_P__ = solve_j2c(wkb_P__) wja = wjb = wka_P__ = wkb_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) @@ -136,71 +140,71 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, wj1a_P, wk1a_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0a_tag, omega=omega) wj1b_P, wk1b_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0b_tag, omega=omega) wj1_P = wj1a_P + wj1b_P - rhoj1_P = contract('pq,ipx->iqx', int2c_inv, wj1_P) + rhoj1_P = solve_j2c(wj1_P) - hj_ao_ao += 4.0*contract('ipx,jpy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None if hessobj.auxbasis_response: wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,ipx->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('ipx,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('ipx,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) wj1_01 = None rhoj1_P = None - int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) if with_k: for i0, i1 in lib.prange(0,nao,64): - wk1a_Pko_islice = cupy.asarray(wk1a_Pko[i0:i1]) - wk1b_Pko_islice = cupy.asarray(wk1b_Pko[i0:i1]) - rhok1a_Pko = contract('pq,iqox->ipox', int2c_inv, wk1a_Pko_islice) - rhok1b_Pko = contract('pq,iqox->ipox', int2c_inv, wk1b_Pko_islice) + wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) + wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) + rhok1a_Pko = solve_j2c(wk1a_Pko_islice) + rhok1b_Pko = solve_j2c(wk1b_Pko_islice) + wk1a_Pko_islice = wk1b_Pko_islice = None for k0, k1 in lib.prange(0,nao,64): - wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[k0:k1]) - wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[k0:k1]) + wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1]) + wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1]) # (10|0)(0|10) without response of RI basis - vk2_ip1_ip1 = contract('ipox,kpoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice) + vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice) hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0a[i0:i1,k0:k1]) - vk2_ip1_ip1 = contract('ipox,kpoy->ikxy', rhok1b_Pko, wk1b_Pko_kslice) + vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1b_Pko, wk1b_Pko_kslice) hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0b[i0:i1,k0:k1]) vk2_ip1_ip1 = None # (10|0)(0|01) without response of RI basis - bra = contract('ipox,ko->ipkx', rhok1a_Pko, mocca[k0:k1]) - ket = contract('kpoy,io->kpiy', wk1a_Pko_kslice, mocca[i0:i1]) - hk_ao_ao[i0:i1,k0:k1] += contract('ipkx,kpiy->ikxy', bra, ket) - bra = contract('ipox,ko->ipkx', rhok1b_Pko, moccb[k0:k1]) - ket = contract('kpoy,io->kpiy', wk1b_Pko_kslice, moccb[i0:i1]) - hk_ao_ao[i0:i1,k0:k1] += contract('ipkx,kpiy->ikxy', bra, ket) + bra = contract('piox,ko->pikx', rhok1a_Pko, mocca[k0:k1]) + ket = contract('pkoy,io->pkiy', wk1a_Pko_kslice, mocca[i0:i1]) + hk_ao_ao[i0:i1,k0:k1] += contract('pikx,pkiy->ikxy', bra, ket) + bra = contract('piox,ko->pikx', rhok1b_Pko, moccb[k0:k1]) + ket = contract('pkoy,io->pkiy', wk1b_Pko_kslice, moccb[i0:i1]) + hk_ao_ao[i0:i1,k0:k1] += contract('pikx,pkiy->ikxy', bra, ket) bra = ket = None wk1a_Pko_kslice = wk1a_Pko_kslice = None if hessobj.auxbasis_response: # (10|0)(1|00) - wk_ip2_Ipo = contract('porx,io->iprx', wka_ip2_P__, mocca[i0:i1]) - hk_ao_aux[i0:i1] += contract('ipox,ipoy->ipxy', rhok1a_Pko, wk_ip2_Ipo) - wk_ip2_Ipo = contract('porx,io->iprx', wkb_ip2_P__, moccb[i0:i1]) - hk_ao_aux[i0:i1] += contract('ipox,ipoy->ipxy', rhok1b_Pko, wk_ip2_Ipo) + wk_ip2_Ipo = contract('porx,io->pirx', wka_ip2_P__, mocca[i0:i1]) + hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1a_Pko, wk_ip2_Ipo) + wk_ip2_Ipo = contract('porx,io->pirx', wkb_ip2_P__, moccb[i0:i1]) + hk_ao_aux[i0:i1] += contract('piox,pioy->ipxy', rhok1b_Pko, wk_ip2_Ipo) wk_ip2_Ipo = None # (10|0)(1|0)(0|00) rhok0a_P_I = contract('qor,ir->qoi', rhok0a_P__, mocca[i0:i1]) - wk1_P_I = contract('ypq,qoi->ipoy', int2c_ip1, rhok0a_P_I) - hk_ao_aux[i0:i1] -= contract("ipox,ipoy->ipxy", rhok1a_Pko, wk1_P_I) + wk1_P_I = contract('ypq,qoi->pioy', int2c_ip1, rhok0a_P_I) + hk_ao_aux[i0:i1] -= contract("piox,pioy->ipxy", rhok1a_Pko, wk1_P_I) rhok0b_P_I = contract('qor,ir->qoi', rhok0b_P__, moccb[i0:i1]) - wk1_P_I = contract('ypq,qoi->ipoy', int2c_ip1, rhok0b_P_I) - hk_ao_aux[i0:i1] -= contract("ipox,ipoy->ipxy", rhok1b_Pko, wk1_P_I) - wk1_P_I = rhok1a_Pko = rhok1b_Pko = None + wk1_P_I = contract('ypq,qoi->pioy', int2c_ip1, rhok0b_P_I) + hk_ao_aux[i0:i1] -= contract("piox,pioy->ipxy", rhok1b_Pko, wk1_P_I) + wk1_P_I = None # (10|0)(0|1)(0|00) for q0,q1 in lib.prange(0,naux,64): - wk1_I = contract('yqp,ipox->iqoxy', int2c_ip1_inv[:,q0:q1], wk1a_Pko_islice) - hk_ao_aux[i0:i1,q0:q1] -= contract('qoi,iqoxy->iqxy', rhok0a_P_I[q0:q1], wk1_I) - wk1_I = contract('yqp,ipox->iqoxy', int2c_ip1_inv[:,q0:q1], wk1b_Pko_islice) - hk_ao_aux[i0:i1,q0:q1] -= contract('qoi,iqoxy->iqxy', rhok0b_P_I[q0:q1], wk1_I) + wk1_I = contract('yqp,piox->qioxy', int2c_ip1[:,q0:q1], rhok1a_Pko) + hk_ao_aux[i0:i1,q0:q1] -= contract('qoi,qioxy->iqxy', rhok0a_P_I[q0:q1], wk1_I) + wk1_I = contract('yqp,piox->qioxy', int2c_ip1[:,q0:q1], rhok1b_Pko) + hk_ao_aux[i0:i1,q0:q1] -= contract('qoi,qioxy->iqxy', rhok0b_P_I[q0:q1], wk1_I) wk1_I = rhok0a_P_I = rhok0b_P_I = None - wk1a_Pko_islice = wk1b_Pko_islice = None + rhok1a_Pko = rhok1b_Pko = None wk1a_Pko = wk1b_Pko = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) @@ -286,6 +290,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # aux-aux pair if hessobj.auxbasis_response > 1: wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) @@ -507,7 +512,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, dm0 = dm0a + dm0b int2c = take_last2d(int2c, aux_ao_idx) - int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) + solve_j2c = _gen_metric_solver(int2c) int2c = None fn = int3c2e.get_int3c2e_wjk @@ -515,27 +520,27 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = contract('pq,q->p', int2c_inv, wj) + rhoj0 = solve_j2c(wj) wj = None if isinstance(wka_Pl_, cupy.ndarray): - rhok0a_Pl_ = contract('pq,qio->pio', int2c_inv, wka_Pl_) + rhok0a_Pl_ = solve_j2c(wka_Pl_) else: rhok0a_Pl_ = np.empty_like(wka_Pl_) for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1]) - rhok0a_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() + rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() wk_tmp = None if isinstance(wkb_Pl_, cupy.ndarray): - rhok0b_Pl_ = contract('pq,qio->pio', int2c_inv, wkb_Pl_) + rhok0b_Pl_ = solve_j2c(wkb_Pl_) else: rhok0b_Pl_ = np.empty_like(wkb_Pl_) for p0, p1 in lib.prange(0,nao,64): wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1]) - rhok0b_Pl_[:,p0:p1] = contract('pq,qio->pio', int2c_inv, wk_tmp).get() + rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() wk_tmp = None - wka_Pl_ = wkb_Pl_ = int2c_inv = None + wka_Pl_ = wkb_Pl_ = None # ----------------------------- # int3c_ip1 contributions diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 16da7baa..40b22b27 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -902,12 +902,12 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] - wj = cupy.zeros([nao,naux,3]) + wj = cupy.zeros([naux,nao,3]) avail_mem = get_avail_mem() use_gpu_memory = True if nao*naux*nocc*3*8 < 0.4*avail_mem: try: - wk = cupy.empty([nao,naux,nocc,3]) + wk = cupy.empty([naux,nao,nocc,3]) except Exception: use_gpu_memory = False else: @@ -915,21 +915,21 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): if not use_gpu_memory: mem = cupy.cuda.alloc_pinned_memory(nao*naux*nocc*3*8) - wk = np.ndarray([nao,naux,nocc,3], dtype=np.float64, order='C', buffer=mem) + wk = np.ndarray([naux,nao,nocc,3], dtype=np.float64, order='C', buffer=mem) # TODO: async data transfer ncp_ij = len(intopt.log_qs) count = 0 for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ip1', omega=omega): if count % ncp_ij == 0: - wk_tmp = cupy.zeros([nao, k1-k0, nocc, 3]) - wj[i0:i1,k0:k1] += contract('xpji,ij->ipx', int3c_blk, dm0_tag[i0:i1,j0:j1]) - wk_tmp[i0:i1,:] += contract('xpji,jo->ipox', int3c_blk, orbo[j0:j1]) + wk_tmp = cupy.zeros([k1-k0,nao,nocc,3]) + wj[k0:k1,i0:i1] += contract('xpji,ij->pix', int3c_blk, dm0_tag[i0:i1,j0:j1]) + wk_tmp[:,i0:i1] += contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) if (count+1) % ncp_ij == 0: if use_gpu_memory: - wk[:,k0:k1] = wk_tmp + wk[k0:k1] = wk_tmp else: - wk[:,k0:k1] = wk_tmp.get() + wk[k0:k1] = wk_tmp.get() count += 1 return wj, wk diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 2c8ba444..4c3d31ba 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -72,12 +72,13 @@ def sg1_prune(nuc, rads, n_ang, radii=radi.SG1RADII): ''' # In SG1 the ang grids for the five regions # 6 38 86 194 86 - leb_ngrid = numpy.array([6, 38, 86, 194, 86]) - alphas = numpy.array(( + leb_ngrid = cupy.array([6, 38, 86, 194, 86]) + alphas = cupy.array(( (0.25 , 0.5, 1.0, 4.5), (0.1667, 0.5, 0.9, 3.5), (0.1 , 0.4, 0.8, 2.5))) r_atom = radii[nuc] + 1e-200 + rads = cupy.asarray(rads) if nuc <= 2: # H, He place = ((rads/r_atom).reshape(-1,1) > alphas[0]).sum(axis=1) elif nuc <= 10: # Li - Ne @@ -244,7 +245,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev, angs = [n_ang] * n_rad logger.debug(mol, 'atom %s rad-grids = %d, ang-grids = %s', symb, n_rad, angs) - + if isinstance(angs, cupy.ndarray): angs = angs.get() angs = numpy.array(angs) coords = [] vol = [] diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index f09cfb00..7cd90a69 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -36,7 +36,7 @@ # import pyscf.grad.rhf to activate nuc_grad_method method from pyscf.grad import rhf # noqa from gpu4pyscf.scf import cphf -from gpu4pyscf.lib.cupy_helper import contract, tag_array, print_mem_info +from gpu4pyscf.lib.cupy_helper import contract, tag_array, print_mem_info, transpose_sum from gpu4pyscf.lib import logger from gpu4pyscf.df import int3c2e @@ -307,7 +307,8 @@ def _get_jk(mol, intor, comp, aosym, script_dms, return vs def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1mo, - fx=None, atmlst=None, max_memory=4000, verbose=None): + fx=None, atmlst=None, max_memory=4000, verbose=None, + max_cycle=50, level_shift=0): '''Solve the first order equation Kwargs: fx : function(dm_mo) => v1_mo @@ -331,7 +332,7 @@ def _ao2mo(mat): return contract('xik,ip->xpk', tmp, mo_coeff) cupy.get_default_memory_pool().free_all_blocks() # TODO: calculate blksize dynamically - blksize = 8 + blksize = 48 mo1s = [None] * mol.natm e1s = [None] * mol.natm aoslices = mol.aoslice_by_atom() @@ -349,7 +350,7 @@ def _ao2mo(mat): h1vo = cupy.vstack(h1vo) s1vo = cupy.vstack(s1vo) - tol = mf.conv_tol_cpscf * (ia1 - ia0) + tol = mf.conv_tol_cpscf mo1, e1 = cphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo, tol=tol, verbose=verbose) # Different from PySCF, mo1 is in AO mo1 = mo1.reshape(-1,3,nao,nocc) @@ -619,7 +620,8 @@ def get_hcore(self, mol=None): def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile, fx=None, atmlst=None, max_memory=4000, verbose=None): return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile, - fx, atmlst, max_memory, verbose) + fx, atmlst, max_memory, verbose, + max_cycle=self.max_cycle, level_shift=self.level_shift) def hess_nuc(self, mol=None, atmlst=None): if mol is None: mol = self.mol diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index c574f0cb..eec93661 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -333,7 +333,8 @@ def _get_jk(mol, intor, comp, aosym, script_dms, return vs def solve_mo1(mf, mo_energy, mo_coeff, mo_occ, h1mo, - fx=None, atmlst=None, max_memory=4000, verbose=None): + fx=None, atmlst=None, max_memory=4000, verbose=None, + max_cycle=50, level_shift=0): '''Solve the first order equation Kwargs: fx : function(dm_mo) => v1_mo @@ -521,7 +522,8 @@ class Hessian(HessianBase): def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile, fx=None, atmlst=None, max_memory=4000, verbose=None): return solve_mo1(self.base, mo_energy, mo_coeff, mo_occ, h1ao_or_chkfile, - fx, atmlst, max_memory, verbose) + fx, atmlst, max_memory, verbose, + max_cycle=self.max_cycle, level_shift=self.level_shift) gen_hop = gen_hop diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 302b7c56..6dff7feb 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -28,7 +28,7 @@ from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA LMAX_ON_GPU = 7 -DSOLVE_LINDEP = 1e-15 +DSOLVE_LINDEP = 1e-12 c2s_l = mole.get_cart2sph(lmax=LMAX_ON_GPU) c2s_data = cupy.concatenate([x.ravel() for x in c2s_l]) @@ -532,14 +532,18 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, ax.extend(axt) if callable(callback): callback(cycle, xs, ax) - x1 = axt.copy() + for i in range(len(xs)): xsi = cupy.asarray(xs[i]) - for j, axj in enumerate(axt): - x1[j] -= xsi * (cupy.dot(xsi.conj(), axj) / innerprod[i]) + w = cupy.dot(axt, xsi.conj()) / innerprod[i] + x1 -= xsi * cupy.expand_dims(w,-1) axt = xsi = None + x1, rmat = _qr(x1, cupy.dot, lindep) + for i in range(len(x1)): + x1[i] *= rmat[i,i] + max_innerprod = 0 idx = [] for i, xi in enumerate(x1): @@ -548,16 +552,17 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, if innerprod1 > lindep and innerprod1 > tol**2: idx.append(i) innerprod.append(innerprod1) - log.debug('krylov cycle %d r = %g', cycle, max_innerprod**.5) - + log.info(f'krylov cycle {cycle} r = {max_innerprod**.5:.3e} {x1.shape[0]} equations') if max_innerprod < lindep or max_innerprod < tol**2: break - x1 = x1[idx] + if len(idx) > 0: + raise RuntimeError("CPSCF failed to converge.") + xs = cupy.asarray(xs) ax = cupy.asarray(ax) - nd = cycle + 1 + nd = xs.shape[0] h = cupy.dot(xs, ax.T) @@ -568,15 +573,10 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot, if b.ndim == 1: g[0] = innerprod[0] else: - ng = min(nd, nroots) - g[:ng, :nroots] += cupy.dot(xs[:ng], b[:nroots].T) - ''' # Restore the first nroots vectors, which are array b or b-(1+a)x0 for i in range(min(nd, nroots)): xsi = cupy.asarray(xs[i]) - for j in range(nroots): - g[i,j] = cupy.dot(xsi.conj(), b[j]) - ''' + g[i] = cupy.dot(xsi.conj(), b.T) c = cupy.linalg.solve(h, g) x = _gen_x0(c, cupy.asarray(xs)) @@ -601,10 +601,11 @@ def _qr(xs, dot, lindep=1e-14): xi = cupy.array(xs[i], copy=True) rmat[:,nv] = 0 rmat[nv,nv] = 1 - for j in range(nv): - prod = dot(qs[j].conj(), xi) - xi -= qs[j] * prod - rmat[:,nv] -= rmat[:,j] * prod + + prod = dot(qs[:nv].conj(), xi) + xi -= cupy.dot(qs[:nv].T, prod) + rmat[:,nv] -= cupy.dot(rmat[:,:nv], prod) + innerprod = dot(xi.conj(), xi).real norm = cupy.sqrt(innerprod) if innerprod > lindep: @@ -614,7 +615,17 @@ def _qr(xs, dot, lindep=1e-14): return qs[:nv], cupy.linalg.inv(rmat[:nv,:nv]) def _gen_x0(v, xs): - return cupy.dot(v.T, xs) + ndim = v.ndim + if ndim == 1: + v = v[:,None] + space, nroots = v.shape + x0 = cupy.einsum('c,x->cx', v[space-1], cupy.asarray(xs[space-1])) + for i in reversed(range(space-1)): + xsi = cupy.asarray(xs[i]) + x0 += cupy.expand_dims(v[i],-1) * xsi + if ndim == 1: + x0 = x0[0] + return x0 def empty_mapped(shape, dtype=float, order='C'): '''(experimental) diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py index af54dc0d..2eb969a8 100644 --- a/gpu4pyscf/lib/tests/test_cupy_helper.py +++ b/gpu4pyscf/lib/tests/test_cupy_helper.py @@ -41,12 +41,21 @@ def test_transpose_sum(self): def test_krylov(self): a = cupy.random.random((10,10)) * 1e-2 - b = cupy.random.random(10) + b = cupy.random.random((3,10)) def aop(x): return cupy.dot(a, x.T).T x = krylov(aop, b) - cupy.allclose(cupy.dot(a,x)+x, b) + + assert cupy.allclose(cupy.dot(a,x.T)+x.T, b.T) + + a = cupy.random.random((10,10)) * 1e-2 + b = cupy.random.random((10)) + + def aop(x): + return cupy.dot(a, x.T).T + x = krylov(aop, b) + assert cupy.allclose(cupy.dot(a,x)+x, b) def test_cderi_sparse(self): naux = 4 @@ -90,6 +99,7 @@ def test_cond(self): cond_cpu = numpy.linalg.cond(a.get()) cond_gpu = cond(a) assert abs(cond_cpu - cond_gpu) < 1e-5 + def test_grouped_dot(self): dtype = cupy.float64 def initialize(dtype, M, N, K): diff --git a/gpu4pyscf/scf/cphf.py b/gpu4pyscf/scf/cphf.py index 52b4a84f..ef77d6c0 100644 --- a/gpu4pyscf/scf/cphf.py +++ b/gpu4pyscf/scf/cphf.py @@ -70,7 +70,8 @@ def vind_vo(mo1): # h1 shape is (:,nocc+nvir,nocc) def solve_withs1(fvind, mo_energy, mo_occ, h1, s1, - max_cycle=50, tol=1e-9, hermi=False, verbose=logger.WARN): + max_cycle=50, tol=1e-9, hermi=False, + verbose=logger.WARN, level_shift=0): '''For field dependent basis. First order overlap matrix is non-zero. The first order orbitals are set to C^1_{ij} = -1/2 S1 @@ -89,36 +90,39 @@ def solve_withs1(fvind, mo_energy, mo_occ, h1, s1, viridx = mo_occ == 0 e_a = mo_energy[viridx] e_i = mo_energy[occidx] - I, A = cupy.meshgrid(e_i, e_a) - e_ai = 1 / (A - I) + e_ai = 1 / (e_a[:,None] + level_shift - e_i) nvir, nocc = e_ai.shape nmo = nocc + nvir s1 = s1.reshape(-1,nmo,nocc) hs = mo1base = h1.reshape(-1,nmo,nocc) - s1*e_i - mo_e1 = hs[:,occidx,:].copy() + mo1base = hs.copy() mo1base[:,viridx] *= -e_ai mo1base[:,occidx] = -s1[:,occidx] * .5 def vind_vo(mo1): - v = fvind(mo1.reshape(h1.shape)).reshape(-1,nmo,nocc) + mo1 = mo1.reshape(-1,nmo, nocc) + v = fvind(mo1).reshape(-1,nmo, nocc) + if level_shift != 0: + v -= mo1 * level_shift v[:,viridx,:] *= e_ai v[:,occidx,:] = 0 - return v.ravel() - mo1 = krylov(vind_vo, mo1base.ravel(), + return v.reshape(-1,nmo*nocc) + + mo1 = krylov(vind_vo, mo1base.reshape(-1,nmo*nocc), tol=tol, max_cycle=max_cycle, hermi=hermi, verbose=log) mo1 = mo1.reshape(mo1base.shape) + mo1[:,occidx] = mo1base[:,occidx] log.timer('krylov solver in CPHF', *t0) - v1mo = fvind(mo1.reshape(h1.shape)).reshape(-1,nmo,nocc) - mo1[:,viridx] = mo1base[:,viridx] - v1mo[:,viridx]*e_ai + hs += fvind(mo1).reshape(mo1base.shape) + mo1[:,viridx] = hs[:,viridx] / (e_i - e_a[:,None]) # mo_e1 has the same symmetry as the first order Fock matrix (hermitian or # anti-hermitian). mo_e1 = v1mo - s1*lib.direct_sum('i+j->ij',e_i,e_i) - It, I = cupy.meshgrid(e_i, e_i) - mo_e1 += mo1[:,occidx] * (I - It)#(cupy.einsum('i,j->ij', e_i, e_i) - mo_e1 += v1mo[:,occidx,:] + mo_e1 = hs[:,occidx,:] + mo_e1 += mo1[:,occidx] * (e_i[:,None] - e_i) if h1.ndim == 3: return mo1, mo_e1 diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 8e11b6ff..297aa1d1 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -672,7 +672,7 @@ class RHF(SCF): _keys = {'e_disp', 'h1e', 's1e', 'e_mf', 'screen_tol', 'conv_tol_cpscf', 'disp_with_3body'} screen_tol = 1e-14 - conv_tol_cpscf = 1e-3 + conv_tol_cpscf = 1e-6 DIIS = diis.SCF_DIIS get_jk = _get_jk _eigh = staticmethod(eigh) diff --git a/gpu4pyscf/scf/tests/test_cphf.py b/gpu4pyscf/scf/tests/test_cphf.py index 6217f67c..f3565350 100644 --- a/gpu4pyscf/scf/tests/test_cphf.py +++ b/gpu4pyscf/scf/tests/test_cphf.py @@ -74,7 +74,7 @@ def test_ucphf(self): s1vob.append(_ao2mo(s1a, mo_coeff[1], moccb)) h1vo = (numpy.vstack(h1voa), numpy.vstack(h1vob)) s1vo = (numpy.vstack(s1voa), numpy.vstack(s1vob)) - mo1_cpu, e1_cpu = ucphf_cpu.solve(fx, mo_energy, mo_occ, h1vo, s1vo) + mo1_cpu, e1_cpu = ucphf_cpu.solve(fx, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) def fx_gpu(mo1): v1vo = fx(mo1.get()) @@ -83,12 +83,12 @@ def fx_gpu(mo1): mo_occ = cupy.asarray(mo_occ) h1vo = cupy.asarray(h1vo) s1vo = cupy.asarray(s1vo) - mo1_gpu, e1_gpu = ucphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo) + mo1_gpu, e1_gpu = ucphf_gpu.solve(fx_gpu, mo_energy, mo_occ, h1vo, s1vo, tol=1e-9) - assert cupy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-10 - assert cupy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-10 - assert cupy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-10 - assert cupy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-10 + assert cupy.linalg.norm(mo1_cpu[0] - mo1_gpu[0].get()) < 1e-6 + assert cupy.linalg.norm(mo1_cpu[1] - mo1_gpu[1].get()) < 1e-6 + assert cupy.linalg.norm(e1_cpu[0] - e1_gpu[0].get()) < 1e-6 + assert cupy.linalg.norm(e1_cpu[1] - e1_gpu[1].get()) < 1e-6 if __name__ == "__main__": print("Full Tests for Unrestricted CPHF") diff --git a/gpu4pyscf/scf/ucphf.py b/gpu4pyscf/scf/ucphf.py index 30e267e4..8f876564 100644 --- a/gpu4pyscf/scf/ucphf.py +++ b/gpu4pyscf/scf/ucphf.py @@ -130,20 +130,20 @@ def solve_withs1(fvind, mo_energy, mo_occ, h1, s1, mo1base_a[:,occidxa] = -s1_a[:,occidxa] * .5 mo1base_b[:,occidxb] = -s1_b[:,occidxb] * .5 mo1base = cupy.hstack((mo1base_a.reshape(nset,-1), mo1base_b.reshape(nset,-1))) - + def vind_vo(mo1): - mo1 = mo1.reshape(mo1base.shape) - v = fvind(mo1).reshape(mo1base.shape) + mo1 = mo1.reshape(-1,nmoa*nocca+nmob*noccb) + v = fvind(mo1).reshape(-1,nmoa*nocca+nmob*noccb) if level_shift != 0: v -= mo1 * level_shift - v1a = v[:,:nmoa*nocca].reshape(nset,nmoa,nocca) - v1b = v[:,nmoa*nocca:].reshape(nset,nmob,noccb) + v1a = v[:,:nmoa*nocca].reshape(-1,nmoa,nocca) + v1b = v[:,nmoa*nocca:].reshape(-1,nmob,noccb) v1a[:,viridxa] *= eai_a v1b[:,viridxb] *= eai_b v1a[:,occidxa] = 0 v1b[:,occidxb] = 0 - return v.ravel() - mo1 = krylov(vind_vo, mo1base.ravel(), + return v.reshape(-1,nmoa*nocca+nmob*noccb) + mo1 = krylov(vind_vo, mo1base.reshape(-1,nmoa*nocca+nmob*noccb), tol=tol, max_cycle=max_cycle, hermi=hermi, verbose=log) mo1 = mo1.reshape(mo1base.shape)