diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 7f2b816e..29ec300f 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -14,7 +14,7 @@ permissions: jobs: build: - runs-on: self-hosted + runs-on: [self-hosted, Linux, X64, v100] steps: - uses: actions/checkout@v3 @@ -23,6 +23,7 @@ jobs: pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion + pip3 install pytest-benchmark pip3 install pyscf --upgrade pip3 install numpy --upgrade pip3 install scipy --upgrade @@ -35,8 +36,13 @@ jobs: export PATH=${CUDA_HOME}/bin:${PATH} export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH sh build.sh - - name: Smoke Test + - name: Test RKS run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest --durations=0 + pytest gpu4pyscf/tests/test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_rks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ + - name: Test UKS + run: | + echo $GITHUB_WORKSPACE + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + pytest gpu4pyscf/tests/test_benchmark_uks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=v1.3.0_uks_1v100 --benchmark-storage=gpu4pyscf/tests/benchmark_results/ diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 4eb534e3..12464ab5 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -21,6 +21,7 @@ jobs: run: | pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip + pip3 install pytest-benchmark pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion pip3 install pyscf --upgrade pip3 install git+https://github.com/pyscf/properties --upgrade @@ -38,7 +39,7 @@ jobs: run: | 
echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --cov=$GITHUB_WORKSPACE + pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] @@ -48,6 +49,7 @@ jobs: run: | pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install --upgrade pip + pip3 install pytest-benchmark pip3 install flake8 pytest coverage pytest-cov pyscf-dispersion pip3 install pyscf --upgrade pip3 install git+https://github.com/pyscf/properties --upgrade @@ -65,4 +67,4 @@ jobs: run: | echo $GITHUB_WORKSPACE export PYTHONPATH="${PYTHONPATH}:$(pwd)" - pytest -m "not smoke" --cov=$GITHUB_WORKSPACE + pytest -m "not benchmark" --cov=$GITHUB_WORKSPACE diff --git a/.gitignore b/.gitignore index 427ffd8a..b8dd78e9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ **/build **/launch_logs **/deps +**/.benchmarks core **tmp* *.egg-info/ diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..7f747686 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,98 @@ +v1.3.0 (2025-01-07) +------------------- +* New Features + - PBC analytical Fourier transform on GPU +* Improvements + - Optimized computation efficiency and memory footprint for density fitting Hessian + - Support pickle serialization for most classes (SCF, DF, PCM, etc.) 
+ - Efficiency of moving CuPy arrays between GPU cards + + +v1.2.1 (2024-12-20) +------------------- +* New Features + - Change the license from GPL v3.0 to Apache 2.0 + - Multi-GPU support for SCF, Gradients, and Hessian computation using AO-direct algorithm + - Add PBC HF and DFT with k-points, UHF/UKS, and density fitting +* Improvements + - Change the default conv_tol_cpscf = 1e-3 / batch of atoms to conv_tol_cpscf = 1e-6 / atom + - Fix numerical instability in complex-valued TDHF diagonalization + - Improve PCM and QMMM with int1e_grids kernel + - Support non-symmetric int3c2e integral + - Optimize Hessian calculation with direct SCF + - Improve the numerical stability of int3c2e for point charge + - Add CI workflow for multi-GPU +* Fixes + - Fix non-contiguous array error in p2p transfer between GPUs. + - Fix bugs in NMR calculations + + +v1.2.0 (2024-12-09) +------------------- +* New Features + - Spin-conserved TDA and TDDFT methods + - Spin-flip TDA method. + - J-engine using McMurchie-Davidson integral algorithm + - Support multi-GPU density fitting energy, gradients and Hessian computation. + - Second order SCF solver +* Improvements + - Support non-hermitian density matrix in J/K builder + - Secondary grids for CPHF solver + - 3-center integral computation efficiency for gradients and hessian + - One-electron Coulomb integrals against point charges and Gaussian charge distributions on grids. + - Automatically apply SCF initial guess from existing wavefunction + + +v1.1.0 (2024-10-29) +------------------- +* New Features + - Add esp charge and resp charge by @wxj6000 in #208 + - New Rys kernel by @sunqm in #221 + - Optimize nuclear gradients using new Rys kernel by @sunqm in #224 + - GPU kernel for analytical hessian by @sunqm in #227 + - Add QM/MM by @MoleOrbitalHybridAnalyst in #218 +* Improvements + - Improved compatibility with pyscf 2.7.0 by @wxj6000 in #216 + - Add skipping SCF cycles by @kvkarandashev in #229 + - Skip building gint, gvhf, ... 
when building libxc by @wxj6000 in #210 +* Bugfix + - Typo in build_wheels.sh by @wxj6000 in #209 + - Typo in dft_driver.py by @wxj6000 in #220 + - Bugfix: cusolver error when specifying gpu by @wxj6000 in #213 + - Bugfix: error in int2c2e by @wxj6000 in #212 + - Bugfix: inconsistent gradient with CPU. Improved to_cpu, uks gradient, and grid_response by @wxj6000 in #230 + - Bugfix: recompute int3c2e in DF UHF by @wxj6000 in #226 + - New Contributors + - @MoleOrbitalHybridAnalyst made their first contribution in #218 + - @kvkarandashev made their first contribution in #229 + + +v1.0.2 (2024-09-03) +------------------- +* Bugfix: append data in h5 file by @wxj6000 in #200 +* Support customized CHELPG radii by @wxj6000 in #202 +* Add cupy installation guide for developer installation instructions by @henryw7 in #204 +* Bugfix: save density when spin unrestricted by @wxj6000 in #205 +* Add chkfile support for pysisyphus by @henryw7 in #203 + + +v1.0.1 (2024-08-24) +------------------- +* Bugfix in rks.reset by @wxj6000 in #191. The bug leads to the failure of geometry optimization with direct SCF (#190) +* Bugfix when CUDA unified memory is disabled. 
Removed CUDA unified memory in libxc, and reduced the overhead in calling libxc @wxj6000 in #180, #189 +* Bugfix and Improvement in opt_driver by @wxj6000 in #187 #197 +* Support SMD in opt_driver and dft driver @liuyu-chem1996 in #196 +* Support thermo calculation in dft_driver @liuyu-chem1996 in #192 + + +v1.0.0 (2024-07-23) +------------------- +Released features: +* Density fitting scheme and direct SCF scheme +* SCF, analytical gradient, and analytical Hessian calculations for Hartree-Fock and DFT +* Spin-conserved and spin-flip TDA and TDDFT for excited states +* Nonlocal functional correction (vv10) for SCF and gradient +* PCM models, SMD model, their analytical gradients, and semi-analytical Hessian matrix +* Unrestricted Hartree-Fock and unrestricted DFT, gradient, and Hessian +* MP2/DF-MP2 and CCSD (experimental) +* Polarizability, IR, and NMR shielding (experimental) diff --git a/benchmarks/cupy_helper/benchmark_memory_copy.py b/benchmarks/cupy_helper/benchmark_memory_copy.py new file mode 100644 index 00000000..8455f3f0 --- /dev/null +++ b/benchmarks/cupy_helper/benchmark_memory_copy.py @@ -0,0 +1,141 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import cupy as cp +from cupyx import profiler +from gpu4pyscf.lib.cupy_helper import copy_array + +''' +Benchmark different ways of transfering data from pinned memory to device +''' + +# Host array +host_array = cp.cuda.alloc_pinned_memory(512*512*512 * 8) +big_host_data = np.ndarray(512**3, dtype=cp.float64, buffer=host_array) +big_host_data = big_host_data.reshape(512,512,512) +big_host_data += np.random.rand(512,512,512) + +# Device array +big_device_data = cp.empty_like(big_host_data) + +# Create views on both arrays +host_view = big_host_data[:, 128:] # Non-contiguous view on the host +device_view = big_device_data[:, 128:] # Non-contiguous view on the device + +print("Host View Shape:", host_view.shape) +print("Device View Shape:", device_view.shape) + +print("------ Benchmark device to host transfer ----------") +size = host_view.nbytes +perf_custom = profiler.benchmark(copy_array, (host_view, device_view), n_repeat=20, n_warmup=3) +t_kernel = perf_custom.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy(c, out): + out[:] = cp.asarray(c) + return out +perf_cupy = profiler.benchmark(cupy_copy, (host_view, device_view), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +print("------- Benchmark host to device transfer ---------") +size = host_view.nbytes +perf_custom = profiler.benchmark(copy_array, (device_view, host_view), n_repeat=20, n_warmup=3) +t_kernel = perf_custom.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy(c, out): + out[:] = c.get() + return out +perf_cupy = profiler.benchmark(cupy_copy, (device_view, host_view), n_repeat=20, n_warmup=3) +t_kernel = 
perf_cupy.gpu_times.mean() +bandwidth = size / t_kernel / 1e9 +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +print("-------- Benchmark device to device transfer (non-contiguous) ---------") + +with cp.cuda.Device(0): + a = cp.random.rand(512,512,512) + device0_view = a[:,128:] +with cp.cuda.Device(1): + b = cp.random.rand(512,512,512) + device1_view = b[:,128:] +perf_cupy = profiler.benchmark(copy_array, (device0_view, device1_view), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +assert np.linalg.norm(device0_view.get() - device1_view.get()) < 1e-10 + +def cupy_copy(c, out): + with cp.cuda.Device(out.device): + out[:] = cp.asarray(c.get()) + return out +perf_cupy = profiler.benchmark(cupy_copy, (device0_view, device1_view), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Using cupy function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +print("-------- Benchmark device to device transfer (contiguous) ---------") +perf_cupy = profiler.benchmark(copy_array, (a, b), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Using custom function', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_copy_contiguous(a, b): + b[:] = a +perf_cupy = profiler.benchmark(cupy_copy_contiguous, (a, b), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Cupy copy contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +def cupy_asarray_contiguous(a, b): + with cp.cuda.Device(b.device): + b = cp.asarray(a) +perf_cupy = profiler.benchmark(cupy_asarray_contiguous, (a, b), n_repeat=20, n_warmup=3) 
+t_kernel = perf_cupy.gpu_times.mean() +bandwidth = device0_view.nbytes / t_kernel / 1e9 +print('Cupy set contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") + +assert np.linalg.norm(a.get() - b.get()) < 1e-10 + + +print('----------- Benchmark reduction across devices ------ ') +from gpu4pyscf.lib.cupy_helper import reduce_to_device +_num_devices = cp.cuda.runtime.getDeviceCount() +a_dist = [] +for device_id in range(_num_devices): + with cp.cuda.Device(device_id): + a = cp.random.rand(512,512,512) + a_dist.append(a) + +perf_cupy = profiler.benchmark(reduce_to_device, (a_dist,), n_repeat=20, n_warmup=3) +t_kernel = perf_cupy.gpu_times.mean() +bandwidth = a_dist[0].nbytes * _num_devices / t_kernel / 1e9 +print('Cupy set contiguous array', t_kernel) +print(f"Effective Bandwidth: {bandwidth:.2f} GB/s") diff --git a/examples/00-h2o.py b/examples/00-h2o.py index 62518557..58c9076e 100644 --- a/examples/00-h2o.py +++ b/examples/00-h2o.py @@ -36,12 +36,12 @@ atom=atom, # water molecule basis='def2-tzvpp', # basis set output='./pyscf.log', # save log file - verbose=6 # control the level of print info + verbose=6 # control the level of print info ) mf_GPU = rks.RKS( # restricted Kohn-Sham DFT mol, # pyscf.gto.object - xc='b3lyp' # xc funtionals, such as pbe0, wb97m-v, tpss, + xc='b3lyp' # xc funtionals, such as pbe0, wb97m-v, tpss, ).density_fit() # density fitting mf_GPU.grids.atom_grid = (99,590) # (99,590) lebedev grids, (75,302) is often enough @@ -51,7 +51,7 @@ # Compute Energy e_dft = mf_GPU.kernel() -print(f"total energy = {e_dft}") # -76.26736519501688 +print(f"total energy = {e_dft}") # -76.46668196729536 # Compute Gradient g = mf_GPU.nuc_grad_method() diff --git a/examples/02-h2o_geomopt.py b/examples/02-h2o_geomopt.py index 1ca982a9..eaadbc26 100644 --- a/examples/02-h2o_geomopt.py +++ b/examples/02-h2o_geomopt.py @@ -43,4 +43,4 @@ def callback(envs): mol_eq = optimize(mf_GPU, maxsteps=20, callback=callback) print("Optimized 
coordinate:") print(mol_eq.atom_coords()) -print('geometry optimization took', time.time() - start_time, 's') +print('Geometry optimization took', time.time() - start_time, 's') diff --git a/examples/04-h2o_esp.py b/examples/04-h2o_esp.py index 9b04c485..264b3685 100644 --- a/examples/04-h2o_esp.py +++ b/examples/04-h2o_esp.py @@ -21,6 +21,7 @@ import numpy as np from pyscf import gto from gpu4pyscf.dft import rks +from gpu4pyscf.gto.int3c1e import int1e_grids atom =''' O 0.0000000000 -0.0000000000 0.1174000000 @@ -33,10 +34,8 @@ mf.kernel() dm = mf.make_rdm1() # compute one-electron density matrix -# Use default mesh grids -coords = mf.grids.coords.get() +# Use default Lebedev grids +coords = mf.grids.coords -# The efficiency can be improved if needed -from pyscf import df -fakemol = gto.fakemol_for_charges(coords) -v = np.einsum('ijp,ij->p', df.incore.aux_e2(mol, fakemol), dm) +# Calculate electrostatic potential +v = int1e_grids(mol, coords, dm=dm) # performing 'ijp,ij->p' efficiently diff --git a/examples/05-h2o_multipole_moment.py b/examples/05-h2o_multipole_moment.py index e360d859..1ea7c677 100644 --- a/examples/05-h2o_multipole_moment.py +++ b/examples/05-h2o_multipole_moment.py @@ -32,10 +32,10 @@ mf.kernel() dm = mf.make_rdm1() -dip = mf.dip_moment(unit='DEBYE', dm=dm.get()) +dip = mf.dip_moment(unit='DEBYE', dm=dm) print('dipole moment:') print(dip) -quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm.get()) +quad = mf.quad_moment(unit='DEBYE-ANG', dm=dm) print('quadrupole moment:') print(quad) diff --git a/examples/14-pcm_solvent.py b/examples/14-pcm_solvent.py index 3fb05d4e..00ea6054 100644 --- a/examples/14-pcm_solvent.py +++ b/examples/14-pcm_solvent.py @@ -31,9 +31,9 @@ mf = rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit() mf = mf.PCM() mf.grids.atom_grid = (99,590) -mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids -mf.with_solvent.method = 'IEF-PCM' -mf.with_solvent.eps = 78.3553 +mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids 
+mf.with_solvent.method = 'IEF-PCM' # Can be C-PCM, SS(V)PE, COSMO +mf.with_solvent.eps = 78.3553 # Dielectric constant mf.kernel() gradobj = mf.nuc_grad_method() diff --git a/examples/15-chelpg.py b/examples/15-chelpg.py index 75161162..8e94d92a 100644 --- a/examples/15-chelpg.py +++ b/examples/15-chelpg.py @@ -32,18 +32,18 @@ mol.basis = '631g' mol.unit = 'B' mol.build() -mol.verbose = 6 +mol.verbose = 4 xc = 'b3lyp' mf = rks.RKS(mol, xc=xc) mf.grids.level = 5 mf.kernel() q = chelpg.eval_chelpg_layer_gpu(mf) -print('partial charge with CHELPG, using modified Bondi radii') +print('Partial charge with CHELPG, using modified Bondi radii') print(q) # [ 0.04402311 0.11333945 -0.25767919 0.10031663] # Customize the radii used for calculating CHELPG charges from pyscf.data import radii q = chelpg.eval_chelpg_layer_gpu(mf, Rvdw=radii.UFF) -print('partial charge with CHELPG, using UFF radii') +print('Partial charge with CHELPG, using UFF radii') print(q) diff --git a/examples/16-smd_solvent.py b/examples/16-smd_solvent.py index e606d74a..446fe38c 100644 --- a/examples/16-smd_solvent.py +++ b/examples/16-smd_solvent.py @@ -28,16 +28,14 @@ mol = pyscf.M(atom=atom, basis='def2-tzvpp', verbose=1) mf = dft.rks.RKS(mol, xc='HYB_GGA_XC_B3LYP').density_fit() -mf = mf.SMD() mf.grids.atom_grid = (99,590) -mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids -mf.with_solvent.method = 'SMD' -mf.with_solvent.solvent = 'water' -e_tot = mf.kernel() -print('total energy with SMD:', e_tot) +e_gas = mf.kernel() +print('total energy in gas phase:', e_gas) -gradobj = mf.nuc_grad_method() -f = gradobj.kernel() +mf = mf.SMD() # Add SMD model to the mean-field object +mf.with_solvent.lebedev_order = 29 # 302 Lebedev grids, +mf.with_solvent.solvent = 'water' # Has to be a string, lookup the solvent name from https://comp.chem.umn.edu/solvation/mnsddb.pdf +e_smd = mf.kernel() +print('total energy in water:', e_smd) -hessobj = mf.Hessian() -h = hessobj.kernel() +print('Solvation free energy:', 
e_smd - e_gas) diff --git a/examples/19-unrestricted_dft.py b/examples/19-unrestricted_dft.py index 86e59402..0ebaec5b 100644 --- a/examples/19-unrestricted_dft.py +++ b/examples/19-unrestricted_dft.py @@ -49,14 +49,3 @@ hobj_with_pcm = mf_with_pcm.Hessian() h = hobj_with_pcm.kernel() - -# SCF, gradient, and Hessian for DF-UKS with IEF-PCM -mf_with_smd = mf.SMD() -mf_with_smd.with_solvent.solvent = 'water' -mf_with_smd.kernel() - -gobj_with_smd = mf_with_smd.nuc_grad_method() -g = gobj_with_smd.kernel() - -hobj_with_smd = mf_with_smd.Hessian() -h = hobj_with_smd.kernel() diff --git a/examples/20-dfmp2.py b/examples/20-dfmp2.py index e00c9b78..6edfc100 100644 --- a/examples/20-dfmp2.py +++ b/examples/20-dfmp2.py @@ -35,7 +35,18 @@ e_corr, t2 = ptobj.kernel() e_mp2 = e_hf + e_corr +# It prints out MP2 energies, those energies are accessible in the PT object. +print('MP2 correlation energy:', ptobj.emp2) +print('SCS MP2 correlation energy:', ptobj.emp2_scs) +print('Total energy with SCS MP2:', ptobj.e_tot_scs) + +print('----- frozen core --------') + # frozen core ptobj.frozen = [0] e_corr, t2 = ptobj.kernel() e_mp2 = e_hf + e_corr + +print('MP2 correlation energy:', ptobj.emp2) +print('SCS MP2 correlation energy:', ptobj.emp2_scs) +print('Total energy with SCS MP2:', ptobj.e_tot_scs) diff --git a/examples/22-resp_charge.py b/examples/22-resp_charge.py index 208adc27..7e83d290 100644 --- a/examples/22-resp_charge.py +++ b/examples/22-resp_charge.py @@ -42,11 +42,11 @@ print(q0) # RESP charge // first stage fitting -q1 = esp.resp_solve(mol, dm) +q1 = esp.resp_solve(mol, dm) -# Add constraint: fix those charges in the second stage +# Add constraint: fix those charges in the second stage # q2[4] = q1[4] -# q2[5] = q1[5] +# q2[5] = q1[5] # q2[6] = q1[6] # q2[7] = q1[7] sum_constraints = [] @@ -58,7 +58,7 @@ equal_constraints = [[1,2,3]] # RESP charge // second stage fitting -q2 = esp.resp_solve(mol, dm, resp_a=1e-3, +q2 = esp.resp_solve(mol, dm, resp_a=1e-3, 
sum_constraints=sum_constraints, equal_constraints=equal_constraints) print('Fitted RESP charge') diff --git a/examples/23-qmmm_pbc.py b/examples/24-qmmm_pbc.py similarity index 100% rename from examples/23-qmmm_pbc.py rename to examples/24-qmmm_pbc.py diff --git a/examples/24-cp_bsse.py b/examples/25-cp_bsse.py similarity index 86% rename from examples/24-cp_bsse.py rename to examples/25-cp_bsse.py index 45a2c845..697cf8bc 100644 --- a/examples/24-cp_bsse.py +++ b/examples/25-cp_bsse.py @@ -21,15 +21,15 @@ from gpu4pyscf.dft import rks atom_A = [ -('O', (0.000000, 0.000000, 0.000000)), -('H', (0.000000, 0.757160, 0.586260)), -('H', (0.000000, -0.757160, 0.586260)) + ('O', (0.000000, 0.000000, 0.000000)), + ('H', (0.000000, 0.757160, 0.586260)), + ('H', (0.000000, -0.757160, 0.586260)) ] atom_B = [ -('O', (0.000000, 0.000000, 2.913530)), -('H', (0.000000, 0.757160, 3.499790)), -('H', (0.000000, -0.757160, 3.499790)) + ('O', (0.000000, 0.000000, 2.913530)), + ('H', (0.000000, 0.757160, 3.499790)), + ('H', (0.000000, -0.757160, 3.499790)) ] atom_AB = atom_A + atom_B @@ -51,7 +51,7 @@ mol_B_ghost.build() def solve_dft(mol, xc='b3lyp'): - mf = rks.RKS(mol, xc='b3lyp').density_fit() + mf = rks.RKS(mol, xc=xc).density_fit() mf.grids.atom_grid = (99,590) return mf.kernel() diff --git a/examples/dft_driver.py b/examples/dft_driver.py index 13aaa0ce..0be7f410 100644 --- a/examples/dft_driver.py +++ b/examples/dft_driver.py @@ -27,7 +27,6 @@ parser.add_argument("--solvent", type=str, default='') args = parser.parse_args() -lib.num_threads(16) start_time = time.time() bas = args.basis mol = pyscf.M( @@ -52,7 +51,7 @@ mf_df.direct_scf_tol = 1e-14 mf_df.conv_tol = 1e-10 mf_df.chkfile = None -mf_df.conv_tol_cpscf = 1e-3 +mf_df.conv_tol_cpscf = 1e-6 e_tot = mf_df.kernel() scf_time = time.time() - start_time print(f'compute time for energy: {scf_time:.3f} s') diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 4526d79d..b823b43b 100644 --- a/gpu4pyscf/__init__.py +++ 
b/gpu4pyscf/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '1.2.1' +__version__ = '1.3.0' from . import lib, grad, hessian, solvent, scf, dft diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 52b0ecf8..da61804c 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -20,7 +20,8 @@ from cupyx.scipy.linalg import solve_triangular from pyscf import lib from pyscf.df import df, addons, incore -from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer +from gpu4pyscf.lib.cupy_helper import (cholesky, tag_array, get_avail_mem, + cart2sph, p2p_transfer, copy_array) from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ @@ -36,7 +37,7 @@ class DF(lib.StreamObject): from gpu4pyscf.lib.utils import to_gpu, device - _keys = {'intopt', 'mol', 'auxmol', 'use_gpu_memory'} + _keys = {'intopt', 'nao', 'naux', 'cd_low', 'mol', 'auxmol', 'use_gpu_memory'} def __init__(self, mol, auxbasis=None): self.mol = mol @@ -52,8 +53,12 @@ def __init__(self, mol, auxbasis=None): self.naux = None self.cd_low = None self._cderi = None + self._vjopt = None self._rsh_df = {} + __getstate__, __setstate__ = lib.generate_pickle_methods( + excludes=('cd_low', 'intopt', '_cderi', '_vjopt')) + @property def auxbasis(self): return self._auxbasis @@ -138,8 +143,7 @@ def get_blksize(self, extra=0, nao=None): log = logger.new_logger(self.mol, self.mol.verbose) device_id = cupy.cuda.Device().id log.debug(f"{mem_avail/1e9:.3f} GB memory available on Device {device_id}, block size = {blksize}") - if blksize < ALIGNED: - raise RuntimeError("Not enough GPU memory") + assert blksize > 0 return blksize def loop(self, blksize=None, unpack=True): @@ -222,12 +226,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, log.debug("Saving CDERI on CPU") _cderi = {} - blksize = (naux + _num_devices - 1) // _num_devices - 
for device_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): + aux_blksize = (naux + _num_devices - 1) // _num_devices + aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED + for device_id in range(_num_devices): + p0 = min(aux_blksize*device_id, naux) + p1 = min(aux_blksize*(device_id+1), naux) + #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): if use_gpu_memory: with cupy.cuda.Device(device_id), _streams[device_id]: _cderi[device_id] = cupy.empty([p1-p0, npairs]) - log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} on Device {device_id}") + log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}") else: mem = cupy.cuda.alloc_pinned_memory((p1-p0) * npairs * 8) cderi_blk = np.ndarray([p1-p0, npairs], dtype=np.float64, order='C', buffer=mem) @@ -249,7 +257,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): task_list = task_list_per_device[device_id] - future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, + future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize, omega=omega, sr_only=sr_only, device_id=device_id) futures.append(future) @@ -261,7 +269,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, return _cderi -def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, device_id=0): +def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize, + omega=None, sr_only=False, device_id=0): ''' Execute CDERI tasks on one device ''' nq = len(intopt.log_qs) @@ -270,7 +279,6 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de naoaux = cd_low.shape[0] npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))] pairs_loc = np.append(0, np.cumsum(npairs)) - blksize = (naux + _num_devices - 1) // _num_devices with cupy.cuda.Device(device_id), _streams[device_id]: assert 
isinstance(mol.verbose, int) log = logger.new_logger(mol, mol.verbose) @@ -341,13 +349,18 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de ij0 = pairs_loc[cp_ij_id] ij1 = pairs_loc[cp_ij_id+1] if isinstance(_cderi[0], np.ndarray): - for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): - for i in range(p0,p1): - cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1]) + for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): + tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) + copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1]) + elif _num_devices > 1: + # Multi-GPU case, copy data to other Devices + for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)): + # Making a copy for contiguous data transfer + tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True) + with cupy.cuda.Device(dev_id): + tmp = copy_array(tmp) + _cderi[dev_id][:,ij0:ij1] = tmp else: - # Copy data to other Devices - for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)): - #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1] - p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1]) - t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) + _cderi[0][:,ij0:ij1] = cderi_block + t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1) return diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index d2083f41..5561cf9c 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -122,7 +122,7 @@ class _DFHF: to_gpu = utils.to_gpu device = utils.device __name_mixin__ = 'DF' - _keys = {'rhoj', 'rhok', 'disp', 'screen_tol'} + _keys = {'rhoj', 'rhok', 'disp', 'screen_tol', 'with_df', 'only_dfj'} def __init__(self, mf, dfobj, only_dfj): self.__dict__.update(mf.__dict__) @@ -132,7 +132,6 @@ def __init__(self, mf, dfobj, only_dfj): self.direct_scf = False self.with_df = dfobj self.only_dfj = only_dfj - self._keys = 
mf._keys.union(['with_df', 'only_dfj']) def undo_df(self): '''Remove the DFHF Mixin''' diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py index 4139726e..2bbf9d9e 100644 --- a/gpu4pyscf/df/grad/jk.py +++ b/gpu4pyscf/df/grad/jk.py @@ -13,8 +13,10 @@ # limitations under the License. from concurrent.futures import ThreadPoolExecutor +import numpy as np import cupy -from gpu4pyscf.lib.cupy_helper import contract, concatenate +from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks +from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices @@ -54,7 +56,7 @@ def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0): t0 = log.timer_debug1(f'rhoj and rhok on Device {device_id}', *t0) return rhoj, rhok -def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True): +def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True): ''' Calculate rhoj and rhok on Multi-GPU system ''' futures = [] @@ -80,3 +82,112 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True): rhok = concatenate(rhok_total) return rhoj, rhok + +def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list, + with_j=True, with_k=True, device_id=0, omega=None): + mol = intopt.mol + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(mol, mol.verbose) + t0 = (logger.process_clock(), logger.perf_counter()) + + orbo_cart = cupy.asarray(orbo_cart) + cart_aux_loc = intopt.cart_aux_loc + nao_cart = dm_cart.shape[0] + naux_cart = intopt._sorted_auxmol.nao + vj = vk = vjaux = vkaux = None + if with_j: + rhoj_cart = cupy.asarray(rhoj_cart) + dm_cart = cupy.asarray(dm_cart) + vj = cupy.zeros((3,nao_cart), order='C') + vjaux = cupy.zeros((3,naux_cart)) + if with_k: + rhok_cart = cupy.asarray(rhok_cart) + vk = cupy.zeros((3,nao_cart), order='C') + vkaux = cupy.zeros((3,naux_cart)) + + for cp_kl_id in task_list: + k0, 
k1 = cart_aux_loc[cp_kl_id], cart_aux_loc[cp_kl_id+1] + rhoj_tmp = rhok_tmp = None + if with_j: + rhoj_tmp = rhoj_cart[k0:k1] + if with_k: + rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) + rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) + ''' + if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): + rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') + + if(rhok_tmp.flags['C_CONTIGUOUS'] == False): + rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') + ''' + ''' + # outcore implementation + buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1) + size = 3*(k1-k0)*nao_cart*nao_cart + int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') + rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart) + vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1]) + vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip) + + buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2) + int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') + rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart) + vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1]) + vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp) + ''' + vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) + if with_j: vj += vj_tmp + if with_k: vk += vk_tmp + vj_tmp, vk_tmp = get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) + if with_j: vjaux[:, k0:k1] = vj_tmp + if with_k: vkaux[:, k0:k1] = vk_tmp + + rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None + t0 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t0) + return vj, vk, vjaux, vkaux + +def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=True, with_k=True, omega=None): + ''' + Calculate vj = (i'j|L)(L|kl)(ij)(kl), vk = (i'j|L)(L|kl)(ik)(jl) + vjaux = (ij|L')(L|kl)(ij)(kl), vkaux = (ij|L')(L|kl)(ik)(jl) + ''' + nao_cart = 
dm_cart.shape[0] + block_size = with_df.get_blksize(nao=nao_cart) + + intopt = VHFOpt(mol, auxmol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, + group_size_aux=block_size, verbose=0)#, group_size=block_size) + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) + + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id], + with_j=with_j, with_k=with_k, device_id=device_id, omega=omega) + futures.append(future) + + rhoj_total = [] + rhok_total = [] + vjaux_total = [] + vkaux_total = [] + for future in futures: + rhoj, rhok, vjaux, vkaux = future.result() + rhoj_total.append(rhoj) + rhok_total.append(rhok) + vjaux_total.append(vjaux) + vkaux_total.append(vkaux) + + rhoj = rhok = vjaux = vkaux = None + if with_j: + rhoj = reduce_to_device(rhoj_total) + vjaux = reduce_to_device(vjaux_total) + if with_k: + rhok = reduce_to_device(rhok_total) + vkaux = reduce_to_device(vkaux_total) + return rhoj, rhok, vjaux, vkaux diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 681e18be..17816bc8 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -22,7 +22,7 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.df.grad.jk import get_rhoj_rhok +from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk LINEAR_DEP_THRESHOLD = df.LINEAR_DEP_THR MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) @@ -44,6 +44,7 @@ def j2c_solver(v): mask = w > lindep v1 = v[:,mask] j2c = cupy.dot(v1/w[mask], v1.conj().T) + w = v = v1 = mask = None def j2c_solver(b): # noqa: F811 return j2c.dot(b.reshape(j2c.shape[0],-1)).reshape(b.shape) return j2c_solver @@ 
-61,7 +62,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega # extended to any 1-particle density matrix if(dm0 is None): dm0 = mf_grad.base.make_rdm1() - mf = mf_grad.base if omega is None: with_df = mf_grad.base.with_df else: @@ -91,7 +91,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega mo_coeff = None orbo = intopt.sort_orbitals(orbo, axis=[0]) - rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k) + rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k) # (d/dX P|Q) contributions if omega and omega > 1e-10: @@ -101,6 +101,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega int2c_e1 = auxmol.intor('int2c2e_ip1') int2c_e1 = cupy.asarray(int2c_e1) + rhoj_cart = rhok_cart = None auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low = with_df.cd_low @@ -128,6 +129,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega elif low.tag == 'cd': #rhok = solve_triangular(low_t, rhok, lower=False) rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) + rhok = rhok.copy(order='C') tmp = contract('pij,qij->pq', rhok, rhok) tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = -contract('xpq,pq->xp', int2c_e1, tmp) @@ -142,12 +144,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega t0 = log.timer_debug1('rhoj and rhok', *t0) int2c_e1 = None - nao_cart = intopt._sorted_mol.nao - block_size = with_df.get_blksize(nao=nao_cart) - - intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, - group_size_aux=block_size)#, group_size=block_size) dm_cart = dm orbo_cart = orbo if not mol.cart: @@ -155,63 +151,14 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega cart2sph = intopt.cart2sph orbo_cart = 
cart2sph @ orbo dm_cart = cart2sph @ dm @ cart2sph.T - - dm = orbo = None - vj = vk = rhoj_tmp = rhok_tmp = None - vjaux = vkaux = None - - naux_cart = intopt._sorted_auxmol.nao - if with_j: - vj = cupy.zeros((3,nao_cart), order='C') - vjaux = cupy.zeros((3,naux_cart)) - if with_k: - vk = cupy.zeros((3,nao_cart), order='C') - vkaux = cupy.zeros((3,naux_cart)) - cupy.get_default_memory_pool().free_all_blocks() - t1 = log.init_timer() - for cp_kl_id in range(len(intopt.aux_log_qs)): - k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] - assert k1-k0 <= block_size - if with_j: - rhoj_tmp = rhoj_cart[k0:k1] - if with_k: - rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) - ''' - if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): - rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') - - if(rhok_tmp.flags['C_CONTIGUOUS'] == False): - rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') - ''' - ''' - # outcore implementation - buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1) - size = 3*(k1-k0)*nao_cart*nao_cart - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp0 = contract('xpji,ij->xip', int3c_ip, dm_cart) - vj_outcore = contract('xip,p->xi', rhoj_tmp0, rhoj_cart[k0:k1]) - vk_outcore = contract('pji,xpji->xi', rhok_tmp, int3c_ip) - - buf = int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2) - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp0 = contract('xpji,ji->xp', int3c_ip, dm_cart) - vjaux_outcore = contract('xp,p->xp', rhoj_tmp0, rhoj_cart[k0:k1]) - vkaux_outcore = contract('xpji,pji->xp', int3c_ip, rhok_tmp) - ''' - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vj += vj_tmp - if with_k: vk += vk_tmp - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vjaux[:, k0:k1] 
= vj_tmp - if with_k: vkaux[:, k0:k1] = vk_tmp - - rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None - t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - + + with_df._cderi = None # release GPU memory + vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=with_j, with_k=with_k, omega=omega) # NOTE: vj and vk are still in cartesian _sorted_mol = intopt._sorted_mol natm = _sorted_mol.natm + nao_cart = _sorted_mol.nao ao2atom = numpy.zeros([nao_cart, natm]) ao_loc = _sorted_mol.ao_loc for ibas, iatm in enumerate(_sorted_mol._bas[:,gto.ATOM_OF]): @@ -225,6 +172,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega _sorted_auxmol = intopt._sorted_auxmol natm = _sorted_auxmol.natm + naux_cart = _sorted_auxmol.nao aux2atom = numpy.zeros([naux_cart, natm]) ao_loc = _sorted_auxmol.ao_loc for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): @@ -237,7 +185,6 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega if with_k: vkaux_3c = aux2atom.T @ vkaux.T vkaux = vkaux_2c - vkaux_3c - return vj, vk, vjaux, vkaux diff --git a/gpu4pyscf/df/grad/uhf.py b/gpu4pyscf/df/grad/uhf.py index fc8de3be..53acd7e0 100644 --- a/gpu4pyscf/df/grad/uhf.py +++ b/gpu4pyscf/df/grad/uhf.py @@ -18,11 +18,11 @@ from cupyx.scipy.linalg import solve_triangular from pyscf import scf, gto from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import tag_array, contract, load_library +from gpu4pyscf.lib.cupy_helper import tag_array, contract from gpu4pyscf.grad import uhf as uhf_grad from gpu4pyscf import __config__ from gpu4pyscf.lib import logger -from gpu4pyscf.df.grad.jk import get_rhoj_rhok +from gpu4pyscf.df.grad.jk import get_rhojk, get_grad_vjk FREE_CUPY_CACHE = True BINSIZE = 128 @@ -80,39 +80,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, # (L|ij) -> rhoj: (L), rhok: (L|oo) low 
= with_df.cd_low - rhoj, rhok = get_rhoj_rhok(with_df, dm, orbo, with_j=with_j, with_k=with_k) + rhoj, rhok = get_rhojk(with_df, dm, orbo, with_j=with_j, with_k=with_k) if dm2 is not None: - rhoj2, _ = get_rhoj_rhok(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False) - ''' - rows = with_df.intopt.cderi_row - cols = with_df.intopt.cderi_col - dm_sparse = dm[rows, cols] - dm_sparse[with_df.intopt.cderi_diag] *= .5 - if dm2 is not None: - dm2_sparse = dm2_tmp[rows, cols] - dm2_sparse[with_df.intopt.cderi_diag] *= .5 - - blksize = with_df.get_blksize() - if with_j: - rhoj = cupy.empty([naux]) - if dm2 is not None: - rhoj2 = cupy.empty([naux]) - if with_k: - rhok = cupy.empty([naux, nocc, nocc], order='C') - p0 = p1 = 0 - - for cderi, cderi_sparse in with_df.loop(blksize=blksize): - p1 = p0 + cderi.shape[0] - if with_j: - rhoj[p0:p1] = 2.0*dm_sparse.dot(cderi_sparse) - if dm2 is not None: - rhoj2[p0:p1] = 2.0*dm2_sparse.dot(cderi_sparse) - if with_k: - tmp = contract('Lij,jk->Lki', cderi, orbo) - contract('Lki,il->Lkl', tmp, orbo, out=rhok[p0:p1]) - p0 = p1 - tmp = dm_sparse = cderi_sparse = cderi = None - ''' + rhoj2, _ = get_rhojk(with_df, dm2_tmp, orbo, with_j=with_j, with_k=False) # (d/dX P|Q) contributions if omega and omega > 1e-10: @@ -120,7 +90,9 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, int2c_e1 = auxmol.intor('int2c2e_ip1') else: int2c_e1 = auxmol.intor('int2c2e_ip1') + int2c_e1 = cupy.asarray(int2c_e1) + rhoj_cart = rhok_cart = None auxslices = auxmol.aoslice_by_atom() aux_cart2sph = intopt.aux_cart2sph low_t = low.T.copy() @@ -154,6 +126,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, rhok = contract('pq,qij->pij', low_t.T, rhok) elif low.tag == 'cd': rhok = solve_triangular(low_t, rhok.reshape(naux, -1), lower=False, overwrite_b=True).reshape(naux, nocc, nocc) + rhok = rhok.copy(order='C') tmp = contract('pij,qij->pq', rhok, rhok) tmp = intopt.unsort_orbitals(tmp, aux_axis=[0,1]) vkaux = 
-contract('xpq,pq->xp', int2c_e1, tmp) @@ -192,58 +165,10 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, orbo_cart = orbo dm = orbo = None - vj = vk = rhoj_tmp = rhok_tmp = None - vjaux = vkaux = None - - naux_cart = intopt._sorted_auxmol.nao - if with_j: - vj = cupy.zeros((3,nao_cart), order='C') - vjaux = cupy.zeros((3,naux_cart)) - if with_k: - vk = cupy.zeros((3,nao_cart), order='C') - vkaux = cupy.zeros((3,naux_cart)) - cupy.get_default_memory_pool().free_all_blocks() - t1 = log.init_timer() - for cp_kl_id in range(len(intopt.aux_log_qs)): - k0, k1 = intopt.cart_aux_loc[cp_kl_id], intopt.cart_aux_loc[cp_kl_id+1] - assert k1-k0 <= block_size - if with_j: - rhoj_tmp = rhoj_cart[k0:k1] - if with_k: - rhok_tmp = contract('por,ir->pio', rhok_cart[k0:k1], orbo_cart) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo_cart) - ''' - if(rhoj_tmp.flags['C_CONTIGUOUS'] == False): - rhoj_tmp = rhoj_tmp.astype(cupy.float64, order='C') - - if(rhok_tmp.flags['C_CONTIGUOUS'] == False): - rhok_tmp = rhok_tmp.astype(cupy.float64, order='C') - ''' - ''' - # outcore implementation - int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 1, out=buf) - size = 3*(k1-k0)*nao_cart*nao_cart - int3c_ip = buf[:size].reshape([3,k1-k0,nao_cart,nao_cart], order='C') - rhoj_tmp = contract('xpji,ij->xip', int3c_ip, dm_cart) - vj += contract('xip,p->xi', rhoj_tmp, rhoj_cart[k0:k1]) - vk += contract('pji,xpji->xi', rhok_tmp, int3c_ip) - - int3c2e.get_int3c2e_ip_slice(intopt, cp_kl_id, 2, out=buf) - rhoj_tmp = contract('xpji,ji->xp', int3c_ip, dm_cart) - vjaux[:, k0:k1] = contract('xp,p->xp', rhoj_tmp, rhoj_cart[k0:k1]) - vkaux[:, k0:k1] = contract('xpji,pji->xp', int3c_ip, rhok_tmp) - ''' - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip1', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if with_j: vj += vj_tmp - if with_k: vk += vk_tmp - - vj_tmp, vk_tmp = int3c2e.get_int3c2e_ip_jk(intopt, cp_kl_id, 'ip2', rhoj_tmp, rhok_tmp, dm_cart, omega=omega) - if 
with_j: vjaux[:, k0:k1] = vj_tmp - if with_k: vkaux[:, k0:k1] = vk_tmp - - rhoj_tmp = rhok_tmp = vj_tmp = vk_tmp = None - t1 = log.timer_debug1(f'calculate {cp_kl_id:3d} / {len(intopt.aux_log_qs):3d}, {k1-k0:3d} slices', *t1) - + with_df._cderi = None # release GPU memory + vj, vk, vjaux, vkaux = get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart, + with_j=with_j, with_k=with_k, omega=omega) + # NOTE: vj and vk are still in cartesian _sorted_mol = intopt._sorted_mol natm = _sorted_mol.natm @@ -260,6 +185,7 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, _sorted_auxmol = intopt._sorted_auxmol natm = _sorted_auxmol.natm + naux_cart = _sorted_auxmol.nao aux2atom = np.zeros([naux_cart, natm]) ao_loc = _sorted_auxmol.ao_loc for ibas, iatm in enumerate(_sorted_auxmol._bas[:,gto.ATOM_OF]): diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py new file mode 100644 index 00000000..40ab3bfd --- /dev/null +++ b/gpu4pyscf/df/hessian/jk.py @@ -0,0 +1,443 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import ctypes +import itertools +import numpy as np +from concurrent.futures import ThreadPoolExecutor +import cupy +from gpu4pyscf.df import int3c2e +from gpu4pyscf.scf.int4c2e import libgint +from gpu4pyscf.hessian.jk import _ao2mo +from gpu4pyscf.lib import logger +from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device +from gpu4pyscf.__config__ import _streams, _num_devices + +NROOT_ON_GPU = 7 + +def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs, + with_j=True, with_k=True, hermi=0, device_id=0): + ''' Calculate J and K matrices with mo response + For CP-HF + ''' + assert hermi == 1 + with cupy.cuda.Device(device_id), _streams[device_id]: + assert isinstance(dfobj.verbose, int) + log = logger.new_logger(dfobj.mol, dfobj.verbose) + t0 = log.init_timer() + dms = cupy.asarray(dms) + n_dm = dms.shape[0] + mo1s = [cupy.asarray(mo1) for mo1 in mo1s] + occ_coeffs = [cupy.asarray(occ_coeff) for occ_coeff in occ_coeffs] + mo_coeff = [cupy.asarray(mo) for mo in mo_coeff] + nao = dms.shape[-1] + intopt = dfobj.intopt + rows = intopt.cderi_row + cols = intopt.cderi_col + dms_shape = dms.shape + if with_j: + dm_sparse = dms[:,rows,cols] + if hermi == 0: + dm_sparse += dms[:,cols,rows] + else: + dm_sparse *= 2 + dm_sparse[:, intopt.cderi_diag] *= .5 + dms = None + + if with_k: + vks = [cupy.zeros_like(mo1) for mo1 in mo1s] + + if with_j: + vj_sparse = cupy.zeros_like(dm_sparse) + + nocc = max([mo1.shape[2] for mo1 in mo1s]) + blksize = dfobj.get_blksize(extra=2*nao*nocc) + for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k): + if with_j: + rhoj = dm_sparse.dot(cderi_sparse) + vj_sparse += cupy.dot(rhoj, cderi_sparse.T) + rhoj = None + cderi_sparse = None + if with_k: + for occ_coeff, mo1, vk in zip(occ_coeffs, mo1s, vks): + nocc = occ_coeff.shape[1] + rhok = contract('Lij,jo->Loi', cderi, occ_coeff) + rhok_oo = contract('Loi,ip->Lop', rhok, occ_coeff).reshape([-1,nocc]) + rhok = rhok.reshape([-1,nao]) + for i in 
range(mo1.shape[0]): + rhok1 = contract('Lij,jo->Loi', cderi, mo1[i]) + rhok1 = rhok1.reshape([-1,nao]) + vk[i] += cupy.dot(rhok1.T, rhok_oo) + + rhok1 = rhok1.reshape([-1,nocc,nao]) + rhok1 = contract('Loi,ip->Lop', rhok1, occ_coeff) + rhok1 = rhok1.reshape([-1,nocc]) + vk[i] += cupy.dot(rhok.T, rhok1) + mo1 = rhok1 = rhok = rhok_oo = None + cderi = None + mo1s = None + if with_j: + vj = cupy.zeros(dms_shape) + vj[:,rows,cols] = vj_sparse + vj[:,cols,rows] = vj_sparse + + vj_mo = vk_mo = None + if len(occ_coeffs) == 1: + # Restricted case + mo = mo_coeff[0] + if with_j: + vj_mo = _ao2mo(vj, occ_coeffs[0], mo).reshape(n_dm,-1) + vj = None + mo *= 2.0 # Due to double occupancy + if with_k: + vk_mo = contract('nio,ip->npo', vks[0], mo).reshape(n_dm,-1) + elif len(occ_coeffs) == 2: + # Unrestricted case + n_dm_2 = n_dm // 2 + mocca, moccb = occ_coeffs + moa, mob = mo_coeff + nmoa, nmob = moa.shape[1], mob.shape[1] + nocca, noccb = mocca.shape[1], moccb.shape[1] + + if with_j: + vjab = vj[:n_dm_2] + vj[n_dm_2:] + vj = None + vj_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vj_mo[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj_mo[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + vjab = None + + if with_k: + vka, vkb = vks + vk_mo = cupy.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vk_mo[:,:nmoa*nocca] = contract('nio,ip->npo', vka, moa).reshape(n_dm_2,-1) + vk_mo[:,nmoa*nocca:] = contract('nio,ip->npo', vkb, mob).reshape(n_dm_2,-1) + + t0 = log.timer_debug1(f'vj and vk on Device {device_id}', *t0) + return vj_mo, vk_mo + +def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0, + with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): + ''' Compute J/K in MO with density fitting + ''' + + log = logger.new_logger(dfobj.mol, dfobj.verbose) + if not isinstance(dms_tag, cupy.ndarray): + dms_tag = cupy.asarray(dms_tag) + + assert(with_j or with_k) + if dms_tag is None: logger.error("dm is not given") + nao = dms_tag.shape[-1] + t1 = t0 = 
log.init_timer() + if dfobj._cderi is None: + log.debug('Build CDERI ...') + dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) + t1 = log.timer_debug1('init jk', *t0) + + assert nao == dfobj.nao + intopt = dfobj.intopt + + nao = dms_tag.shape[-1] + dms = dms_tag.reshape([-1,nao,nao]) + intopt = dfobj.intopt + dms = intopt.sort_orbitals(dms, axis=[1,2]) + + cupy.cuda.get_current_stream().synchronize() + occ_coeffs = dms_tag.occ_coeff + mo1s = dms_tag.mo1 + + if not isinstance(occ_coeffs, (tuple, list)): + occ_coeffs = [occ_coeffs] + mo1s = [mo1s] + mo_coeff = [mo_coeff] + else: + assert isinstance(mo1s, (tuple, list)) + mo_coeff = [mo_coeff[0], mo_coeff[1]] + + occ_coeffs = [intopt.sort_orbitals(occ_coeff, axis=[0]) for occ_coeff in occ_coeffs] + mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s] + mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff] + + futures = [] + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_task_with_mo1, + dfobj, dms, mo_coeff, mo1s, occ_coeffs, + hermi=hermi, device_id=device_id, + with_j=with_j, with_k=with_k) + futures.append(future) + + vj = vk = None + if with_j: + vj = [future.result()[0] for future in futures] + vj = reduce_to_device(vj, inplace=True) + + if with_k: + vk = [future.result()[1] for future in futures] + vk = reduce_to_device(vk, inplace=True) + t1 = log.timer_debug1('vj and vk', *t1) + return vj, vk + + +def _get_int3c2e_ipip_slice(ip_type, intopt, cp_ij_id, aux_id, omega=None, stream=None): + + if omega is None: omega = 0.0 + if stream is None: stream = cupy.cuda.get_current_stream() + + fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) + nao = intopt._sorted_mol.nao + naux = intopt._sorted_auxmol.nao + norb = nao + naux + 1 + comp = 9 + order = 2 + nbins = 1 + + cp_kl_id = aux_id + len(intopt.log_qs) + lk = intopt.aux_angular[aux_id] + + cpi = intopt.cp_idx[cp_ij_id] + cpj = 
intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 + + log_q_ij = intopt.log_qs[cp_ij_id] + log_q_kl = intopt.aux_log_qs[aux_id] + + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) + + ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32) + strides = np.array([1, ni, ni*nj, ni*nj*nk], dtype=np.int32) + + # Use GPU kernels for low-angular momentum + if (li + lj + lk + order)//2 + 1 < NROOT_ON_GPU: + int3c_blk = cupy.zeros([comp, nk, nj, ni], order='C', dtype=np.float64) + err = fn( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + bins_locs_kl.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.c_int(cp_kl_id), + ctypes.c_double(omega)) + if err != 0: + raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') + else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + + # TODO: sph2cart in CPU? 
+ ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] + jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] + kshl0, kshl1 = intopt.l_ctr_offsets[aux_id+1+intopt.nctr], intopt.l_ctr_offsets[aux_id+1+intopt.nctr+1] + shls_slice = np.array([ishl0, ishl1, jshl0, jshl1, kshl0, kshl1], dtype=np.int64) + int3c_cpu = getints(intor, pmol._atm, pmol._bas, pmol._env, shls_slice, cintopt=opt).transpose([0,3,2,1]) + int3c_blk = cupy.asarray(int3c_cpu) + + if not intopt.auxmol.cart: + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) + if not intopt.mol.cart: + int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) + int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) + + return int3c_blk + + +def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo, + device_id=0, with_j=True, with_k=True, omega=None, + auxbasis_response=1): + natm = intopt.mol.natm + nao = dm0.shape[0] + assert with_j or with_k + ao_loc = intopt.ao_loc + aux_ao_loc = intopt.aux_ao_loc + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() + orbo = cupy.asarray(orbo) + dm0 = cupy.asarray(dm0) + nao = dm0.shape[0] + if with_j: + naux = rhoj.shape[0] + rhoj = cupy.asarray(rhoj) + hj_ipip1 = cupy.zeros([9,nao]) + hj_ipip2 = cupy.zeros([9,naux]) + hj_ip1ip2 = cupy.zeros([9,nao,naux]) + hj_ipvip1 = cupy.zeros([9,nao,nao]) + if with_k: + naux = rhok.shape[0] + rhok = cupy.asarray(rhok) + hk_ipip1 = cupy.zeros([9,nao]) + hk_ipip2 = cupy.zeros([9,naux]) + hk_ip1ip2 = cupy.zeros([9,nao,naux]) + hk_ipvip1 = cupy.zeros([9,nao,nao]) + + cupy.get_default_memory_pool().free_all_blocks() + for aux_id, cp_ij_id in task_list: + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + i0, i1 = ao_loc[cpi], ao_loc[cpi+1] + j0, j1 = ao_loc[cpj], ao_loc[cpj+1] + k0, k1 = aux_ao_loc[aux_id], aux_ao_loc[aux_id+1] + + if with_k: + rhok_tmp = contract('por,ir->poi', rhok[k0:k1], orbo[i0:i1]) + rhok_tmp = 
contract('poi,jo->pji', rhok_tmp, orbo[j0:j1]) + + # (20|0), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ipip1', intopt, cp_ij_id, aux_id, omega=omega) + if with_j: + tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) + hj_ipip1[:,i0:i1] += contract('xji,ij->xi', tmp, dm0[i0:i1,j0:j1]) + if with_k: + hk_ipip1[:,i0:i1] += contract('xpji,pji->xi', int3c_blk, rhok_tmp) + int3c_blk = tmp = None + + # (11|0), (0|0)(0|00) without response of RI basis + int3c_blk = _get_int3c2e_ipip_slice('ipvip1', intopt, cp_ij_id, aux_id, omega=omega) + if with_j: + tmp = contract('xpji,p->xji', int3c_blk, rhoj[k0:k1]) + hj_ipvip1[:,i0:i1,j0:j1] += contract('xji,ij->xij', tmp, dm0[i0:i1,j0:j1]) + if with_k: + hk_ipvip1[:,i0:i1,j0:j1] += contract('xpji,pji->xij', int3c_blk, rhok_tmp) + int3c_blk = tmp = None + + if auxbasis_response < 1: + continue + + # (10|1), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ip1ip2', intopt, cp_ij_id, aux_id, omega=omega) + if with_j: + tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ip1ip2[:,i0:i1,k0:k1] += contract('xpi,p->xip', tmp, rhoj[k0:k1]) + if with_k: + hk_ip1ip2[:,i0:i1,k0:k1] += contract('xpji,pji->xip', int3c_blk, rhok_tmp) + int3c_blk = tmp = None + + if auxbasis_response < 2: + continue + + # (00|2), (0|0)(0|00) + int3c_blk = _get_int3c2e_ipip_slice('ipip2', intopt, cp_ij_id, aux_id, omega=omega) + if with_j: + tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) + hj_ipip2[:,k0:k1] += contract('xp,p->xp', tmp, rhoj[k0:k1]) + if with_k: + hk_ipip2[:,k0:k1] += contract('xpji,pji->xp', int3c_blk, rhok_tmp) + int3c_blk = tmp = None + auxslices = intopt.auxmol.aoslice_by_atom() + aoslices = intopt.mol.aoslice_by_atom() + ao2atom = int3c2e.get_ao2atom(intopt, aoslices) + aux2atom = int3c2e.get_aux2atom(intopt, auxslices) + + hj = None + if with_j: + hj_ipvip1 = hj_ipvip1.reshape([3,3,nao,nao]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ipvip1) + hj = 2.0 * contract('jb,ajxy->abxy', ao2atom, tmp) + + 
hj_ipip1 = hj_ipip1.reshape([3,3,nao]) + tmp = contract('ia,xyi->axy', ao2atom, hj_ipip1) + hj[range(natm), range(natm)] += 2.0 * tmp + + hk = None + if with_k: + hk_ipvip1 = hk_ipvip1.reshape([3,3,nao,nao]) + tmp = contract('ia,xyij->ajxy', ao2atom, hk_ipvip1) + hk = contract('jb,ajxy->abxy', ao2atom, tmp) + + hk_ipip1 = hk_ipip1.reshape([3,3,nao]) + tmp = contract('ia,xyi->axy', ao2atom, hk_ipip1) + hk[range(natm), range(natm)] += tmp + + if auxbasis_response > 0: + if with_j: + hj_ip1ip2 = hj_ip1ip2.reshape([3,3,nao,naux]) + tmp = contract('ia,xyij->ajxy', ao2atom, hj_ip1ip2) + tmp = contract('jb,ajxy->abxy',aux2atom, tmp) + tmp = tmp + tmp.transpose([1,0,3,2]) + hj += tmp + if auxbasis_response > 1: + hj += tmp + if with_k: + hk_ip1ip2 = hk_ip1ip2.reshape([3,3,nao,naux]) + tmp = contract('ia,xyij->ajxy', ao2atom, hk_ip1ip2) + tmp = contract('jb,ajxy->abxy', aux2atom, tmp) + tmp = 0.5 * (tmp + tmp.transpose([1,0,3,2])) + hk += tmp + if auxbasis_response > 1: + hk += tmp + + if auxbasis_response > 1: + if with_j: + hj_ipip2 = hj_ipip2.reshape([3,3,naux]) + tmp = contract('ia,xyi->axy', aux2atom, hj_ipip2) + hj[range(natm), range(natm)] += tmp + if with_k: + hk_ipip2 = hk_ipip2.reshape([3,3,naux]) + tmp = contract('ia,xyi->axy', aux2atom, hk_ipip2) + hk[range(natm), range(natm)] += .5 * tmp + t0 = log.timer_debug1(f'int3c2e_ipip on Device {device_id}', *t0) + return hj, hk + +def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True, + omega=None, auxbasis_response=1): + orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') + futures = [] + ncp_k = len(intopt.aux_log_qs) + ncp_ij = len(intopt.log_qs) + tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) + task_list = [] + for device_id in range(_num_devices): + task_list.append(tasks[device_id::_num_devices]) + + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = 
executor.submit( + _int3c2e_ipip_tasks, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, + device_id=device_id, omega=omega, + auxbasis_response=auxbasis_response) + futures.append(future) + + hj_total = [] + hk_total = [] + for future in futures: + hj, hk = future.result() + hj_total.append(hj) + hk_total.append(hk) + + hj = hk = None + if with_j: + hj = reduce_to_device(hj_total, inplace=True) + if with_k: + hk = reduce_to_device(hk_total, inplace=True) + return hj, hk diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index aaf1c16e..2eab8ef5 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -30,15 +30,15 @@ from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import ( - contract, tag_array, get_avail_mem, release_gpu_stack, pinv) + contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array) from gpu4pyscf.df import int3c2e, df from gpu4pyscf.lib import logger from gpu4pyscf import __config__ from gpu4pyscf.df.grad.rhf import _gen_metric_solver -from gpu4pyscf.gto.mole import sort_atoms +from gpu4pyscf.df.hessian import jk LINEAR_DEP_THR = df.LINEAR_DEP_THR -BLKSIZE = 128 +BLKSIZE = 256 ALIGNED = getattr(__config__, 'ao_aligned', 32) GB = 1024*1024*1024 @@ -53,11 +53,13 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): ''' nnz = rhok1_Pko.shape[0] nao = dm0.shape[0] + hk_ao_ao = cupy.zeros([nao,nao,3,3]) + cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED - hk_ao_ao = cupy.zeros([nao,nao,3,3]) for k0, k1 in lib.prange(0,nnz,blksize): - rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) + #rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1]) + rhok1_Pko_kslice = copy_array(rhok1_Pko[k0:k1]) # (10|0)(0|10) without response of RI basis vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice) @@ -67,12 
+69,11 @@ def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2): # (10|0)(0|01) without response of RI basis rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2) hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice) - rhok1_Pkl_kslice = None + rhok1_Pkl_kslice = rhok1_Pko_kslice = None return hk_ao_ao - -def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, - atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None): +def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, + max_memory=None, verbose=None, with_j=True, with_k=True, omega=None): '''Partial derivative ''' log = logger.new_logger(hessobj, verbose) @@ -110,7 +111,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0, + group_size=BLKSIZE, group_size_aux=BLKSIZE) naux = auxmol.nao mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0]) dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) @@ -119,55 +121,66 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c = cupy.asarray(int2c, order='C') int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) - - int2c_ip1 = cupy.asarray(int2c_ip1, order='C') - int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - hk_ao_ao = cupy.zeros([nao,nao,3,3]) - if hessobj.auxbasis_response: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - hk_ao_aux = cupy.zeros([nao,naux,3,3]) - + # int3c contributions wj, wk_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0_tag, omega=omega) + rhoj0_P = rhok0_P__ = None + + if with_j: + rhoj0_P = 
solve_j2c(wj) + wj = None + if with_k: + rhok0_P__ = solve_j2c(wk_P__) + wk_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) - rhoj0_P = solve_j2c(wj) - rhok0_P__ = solve_j2c(wk_P__) - wj = wk_P__ = None + + hj_ipip, hk_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, + with_j=with_j, with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) # int3c_ip2 contributions wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega) - t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1) + t1 = log.timer_debug1('intermediate variables with int3c2e_ip2', *t1) # int3c_ip1 contributions wj1_P, wk1_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0_tag, omega=omega) + t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) + + cupy.get_default_memory_pool().free_all_blocks() + release_gpu_stack() + #rhoj1_P = contract('pq,pix->qix', int2c_inv, wj1_P) - rhoj1_P = solve_j2c(wj1_P) + int2c_ip1 = cupy.asarray(int2c_ip1, order='C') + int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) - wj1_P = None - if hessobj.auxbasis_response: - wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) - hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) - wj1_01 = None - rhoj1_P = None + if with_j: + rhoj1_P = solve_j2c(wj1_P) + hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + wj1_P = None + if hessobj.auxbasis_response: + wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2) # 
(10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) + wj1_01 = None + rhoj1_P = None if with_k: cupy.get_default_memory_pool().free_all_blocks() mem_avail = get_avail_mem() nocc = mocc.shape[1] slice_size = naux*nocc*9 # largest slice of intermediate variables - blksize = int(mem_avail*0.2/8/slice_size/ALIGNED) * ALIGNED + blksize = int(mem_avail*0.2/8/slice_size) log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block') - if blksize < ALIGNED: - raise RuntimeError('Not enough memory for intermediate variables') - + assert blksize > 0 + if hessobj.auxbasis_response: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) for i0, i1 in lib.prange(0,nao,blksize): - wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) + #wk1_Pko_islice = cupy.asarray(wk1_Pko[:,i0:i1]) + wk1_Pko_islice = copy_array(wk1_Pko[:,i0:i1]) + #rhok1_Pko = contract('pq,qiox->piox', int2c_inv, wk1_Pko_islice) rhok1_Pko = solve_j2c(wk1_Pko_islice) wk1_Pko_islice = None @@ -188,6 +201,12 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_ao_aux[i0:i1] -= contract('qoi,qioxy->iqxy', rhok0_P_I, wk1_I) wk1_I = rhok0_P_I = None rhok1_Pko = None + t1 = log.timer_debug1('contract int3c2e_ip1 with int2c_ip1', *t1) + + rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) + rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) + rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__) + rhok0_P__ = wk_ip2_P__ = None w, v = cupy.linalg.eigh(int2c) idx = w > LINEAR_DEP_THR @@ -197,55 +216,24 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rhok1_Pko = wk1_Pko[:nnz] # Reuse the same memory for i0, i1 in lib.prange(0,nao,blksize): - wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1]) + #wk1_tmp = cupy.asarray(wk1_Pko[:,i0:i1]) + wk1_tmp = copy_array(wk1_Pko[:,i0:i1]) if isinstance(rhok1_Pko, cupy.ndarray): rhok1_Pko[:,i0:i1] = 
contract('qp,qiox->piox', cd_low, wk1_tmp) else: - rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() + #rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get() + wk1_tmp = contract('qp,qiox->piox', cd_low, wk1_tmp) + copy_array(wk1_tmp, rhok1_Pko[:,i0:i1]) wk1_tmp = None cd_low = None - - hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) + hk_ao_ao = _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2) wk1_Pko = rhok1_Pko = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) - - cupy.get_default_memory_pool().free_all_blocks() - # int3c_ipip1 contributions - hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_hjk(intopt, 'ipip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_diag *= 2.0 - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1) - - # int3c_ipvip1 contributions - # (11|0), (0|00) without response of RI basis - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipvip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_ao += 2.0*hj - if with_k: - hk_ao_ao += hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) + solve_j2c = None + t1 = log.timer_debug1('contract int3c2e_ip1 with int3c2e_ip1', *t1) - # int3c_ip1ip2 contributions - # (10|1), (0|0)(0|00) - if hessobj.auxbasis_response: - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ip1ip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_ao_aux += hj - if with_k: - hk_ao_aux += hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1) - - # int3c_ipip2 contributions - if hessobj.auxbasis_response > 1: - # (00|2), (0|0)(0|00) - hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k) - hj_aux_diag = hj - if with_k: - hk_aux_diag = .5*hk - hj = hk = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1) - # int2c contributions if hessobj.auxbasis_response > 1: + 
cupy.get_default_memory_pool().free_all_blocks() if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') @@ -253,13 +241,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) - rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + # (00|0)(2|0)(0|00) # p,xp->px - hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + if with_j: + rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: - rho2c_0 = contract('pij,qji->pq', rhok0_P__, rhok0_P__) - hk_aux_diag -= .5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) + hk_aux_diag = -.5 * contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None if omega and omega > 1e-10: @@ -269,41 +258,34 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) - hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) + if with_j: + hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) int2c_ip1ip2 = None - cupy.get_default_memory_pool().free_all_blocks() - release_gpu_stack() - # aux-aux pair - if hessobj.auxbasis_response > 1: + # aux-aux pair int2c_inv = pinv(int2c, lindep=LINEAR_DEP_THR) - wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) - - rhoj0_10 = 
contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) - hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) - rhoj0_10 = rhoj0_P = None - - rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) - hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) - hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) - hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) - wj0_10 = rhoj1 = wj_ip2 = None - - rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) - hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) - wj0_01 = rhoj0_01 = None + if with_j: + wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) + hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) + hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) + rhoj0_10 = rhoj0_P = None + + rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) + hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) + hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) + hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) + wj0_10 = rhoj1 = wj_ip2 = None + + rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) + hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) + wj0_01 = rhoj0_01 = None if with_k: - rho2c_10 = contract('rijx,qij->rqx', wk_ip2_P__, rhok0_P__) - rhok0_P__ = None - - rho2c_11 = contract('pijx,qijy->pqxy', wk_ip2_P__, wk_ip2_P__) - wk_ip2_P__ = None hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_11 = None @@ -327,26 +309,24 @@ def _partial_hess_ejk(hessobj, 
mo_energy=None, mo_coeff=None, mo_occ=None, t1 = log.timer_debug1('contract int2c_*', *t1) dm0 = intopt.unsort_orbitals(dm0, axis=[0,1]) - hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) - hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) - if hessobj.auxbasis_response: - hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) - if hessobj.auxbasis_response > 1: - hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) - hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) + if with_j: + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) + if hessobj.auxbasis_response: + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) + if hessobj.auxbasis_response > 1: + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) if hessobj.auxbasis_response > 1: - hk_aux_diag = intopt.unsort_orbitals(hk_aux_diag, aux_axis=[0]) hk_aux_aux = intopt.unsort_orbitals(hk_aux_aux, aux_axis=[0,1]) #======================================== sort AO end =========================================== # Energy weighted density matrix # pi,qi,i->pq dme0 = cupy.dot(mocc, (mocc * mo_energy[mo_occ>0] * 2).T) de_hcore = rhf_hess._e_hcore_generator(hessobj, dm0) + t1 = log.timer_debug1('hcore generate', *t1) # ------------------------------------ # overlap matrix contributions @@ -360,19 +340,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ----------------------------------------- # collecting all # ----------------------------------------- - e1 = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ej = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ek = cupy.zeros([len(atmlst),len(atmlst),3,3]) + natm = len(atmlst) + e1 = cupy.zeros([natm,natm,3,3]) + ej = hj_ipip + ek = 
hk_ipip + for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) - ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] - ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1]) + if with_j: + ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) if with_k: ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] += de_hcore(ia, ja) @@ -381,13 +361,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response: for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) - if hessobj.auxbasis_response > 1: - ej[i0,j0] += _ej * 2 - ej[j0,i0] += _ej.T * 2 - else: - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) + if hessobj.auxbasis_response > 1: + ej[i0,j0] += _ej * 2 + ej[j0,i0] += _ej.T * 2 + else: + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1]) if hessobj.auxbasis_response > 1: @@ -401,13 +382,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response > 1: shl0, shl1, p0, p1 = auxslices[ia] - ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0) for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek * .5 @@ -415,9 +394,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for i0, ia in enumerate(atmlst): for 
j0 in range(i0): e1[j0,i0] = e1[i0,j0].T - ej[j0,i0] = ej[i0,j0].T - ek[j0,i0] = ek[i0,j0].T + if with_j: + ej[j0,i0] = ej[i0,j0].T + if with_k: + ek[j0,i0] = ek[i0,j0].T t1 = log.timer_debug1('hcore contribution', *t1) + + aux2atom = int3c2e.get_aux2atom(intopt, auxslices) + + natm = mol.natm + idx = range(natm) + # Diagonal contributions + if hessobj.auxbasis_response > 1: + if with_j: + ej[idx, idx] += contract('ia,ixy->axy', aux2atom, hj_aux_diag) + if with_k: + ek[idx, idx] += contract('ia,ixy->axy', aux2atom, hk_aux_diag) + log.timer('RHF partial hessian', *time0) return e1, ej, ek @@ -425,19 +418,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol natm = mol.natm - nocc = int(cupy.count_nonzero(mo_occ > 0)) - nmo = len(mo_occ) - h1ao = cupy.empty((natm, 3, nmo, nocc)) - for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True): - h1 += vj1 - vk1 * .5 - h1ao[ia] = h1 - return h1ao - -def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, - verbose=None, with_k=True, omega=None): + assert atmlst is None or atmlst ==range(natm) + vj, vk = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True) + # h1mo = h1 + vj - 0.5 * vk + h1mo = vk + h1mo *= -.5 + h1mo += vj + h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) + return h1mo + +def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, + verbose=None, with_j=True, with_k=True, omega=None): ''' - A generator to produce the derivatives of Hcore, J, K matrices in MO bases + Derivatives of J, K matrices in MO bases ''' log = logger.new_logger(hessobj, verbose) t0 = log.init_timer() @@ -447,8 +440,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) - # FIXME - with_k = True + mo_coeff = cupy.asarray(mo_coeff, order='C') mo_occ 
= cupy.asarray(mo_occ, order='C') @@ -475,7 +467,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, - aosym=False, + aosym=False, verbose=0, group_size_aux=BLKSIZE, group_size=BLKSIZE) naux = auxmol.nao @@ -484,26 +476,31 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[0]) dm0 = intopt.sort_orbitals(dm0, axis=[0,1]) dm0_tag = tag_array(dm0, occ_coeff=mocc) - + int2c = intopt.sort_orbitals(int2c, aux_axis=[0,1]) solve_j2c = _gen_metric_solver(int2c) - wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = solve_j2c(wj) + wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, + with_j=with_j, with_k=True, omega=omega) + rhoj0 = None + if with_j: + rhoj0 = solve_j2c(wj) + wj = None - wj = None if isinstance(wk_Pl_, cupy.ndarray): rhok0_Pl_ = solve_j2c(wk_Pl_) else: - #rhok0_Pl_ = np.empty_like(wk_Pl_) - #mem = cupy.cuda.alloc_pinned_memory(wk_Pl_.nbytes) - #rhok0_Pl_ = np.ndarray(wk_Pl_.shape, dtype=np.float64, order='C', buffer=mem) rhok0_Pl_ = wk_Pl_ # reuse the memory for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) - rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + #wk_tmp = cupy.asarray(wk_Pl_[:,p0:p1]) + #rhok0_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wk_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0_Pl_[:,p0:p1]) wk_tmp = None - wk_Pl_ = solve_j2c = None + wk_Pl_ = None + solve_j2c = None t0 = log.timer_debug1('Fock matrix due to int3c2e', *t0) + vj1_int3c = vk1_int3c = None # -------------------------- # int3c_ip2 contribution @@ -511,8 +508,10 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, cupy.get_default_memory_pool().free_all_blocks() if hessobj.auxbasis_response: fn = int3c2e.get_int3c2e_ip2_vjk - vj1_int3c_ip2, vk1_int3c_ip2 = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, omega=omega) - 
vk1_int3c_ip2 *= 2.0 + vj1_int3c, vk1_int3c = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) + t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + # Responses due to int2c2e_ip1 if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): @@ -522,64 +521,77 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - # Generate rhok0_P__ - if isinstance(rhok0_Pl_, cupy.ndarray): - rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc) - else: - rhok0_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) - rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc) - rhok0_Pl_tmp = None - - wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) - wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) + if with_j: + wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) + if with_k: + # Generate rhok0_P__ + if isinstance(rhok0_Pl_, cupy.ndarray): + rhok0_P__ = contract('pio,ir->pro', rhok0_Pl_, mocc) + else: + rhok0_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + #rhok0_Pl_tmp = cupy.asarray(rhok0_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0_Pl_[p0:p1]) + rhok0_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocc) + rhok0_Pl_tmp = None + wk0_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0_P__) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) mem_avail = get_avail_mem() blksize = int(0.2*mem_avail/(3*naux*nocc*8)/ALIGNED) * ALIGNED log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} AOs per block') if blksize < ALIGNED: - raise RuntimeError('Not enough memory to compute int3c2e_ip2') + raise RuntimeError('Not enough memory to compute int2c2e_ip2') for p0, p1 in lib.prange(0,nao,blksize): - rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) - vj1_tmp = contract('pio,xp->xpio', 
rhok_tmp, wj0_10) - + #rhok_tmp = cupy.asarray(rhok0_Pl_[:,p0:p1]) + rhok_tmp = copy_array(rhok0_Pl_[:,p0:p1]) wk0_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhok_tmp) - vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) - vj1_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) - vj1_tmp = None + if with_j: + vj1_tmp = contract('pio,xp->xpio', rhok_tmp, wj0_10) + vj1_tmp += contract('xpio,p->xpio', wk0_10_Pl_, rhoj0) + vj1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1_tmp, aux2atom) + vj1_tmp = None if with_k: vk1_tmp = contract('xpio,pro->xpir', wk0_10_Pl_, rhok0_P__) vk1_tmp += contract('xpro,pir->xpio', wk0_10_P__, rhok_tmp) - # 2.0 due to spin - vk1_int3c_ip2[:,:,p0:p1] += 2.0*contract('xpio,pa->axio', vk1_tmp, aux2atom) + vk1_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1_tmp, aux2atom) vk1_tmp = None wk0_10_Pl_ = rhok_tmp = None wj0_10 = wk0_10_P__ = rhok0_P__ = int2c_ip1 = None aux2atom = None - - vj1_int3c_ip2 = contract('nxiq,ip->nxpq', vj1_int3c_ip2, mo_coeff) - vk1_int3c_ip2 = contract('nxiq,ip->nxpq', vk1_int3c_ip2, mo_coeff) - t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + t0 = log.timer_debug1('Fock matrix due to int2c2e_ip1', *t0) # ----------------------------- # int3c_ip1 contributions # ------------------------------ cupy.get_default_memory_pool().free_all_blocks() fn = int3c2e.get_int3c2e_ip1_vjk - vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega) - rhoj0 = rhok0_Pl_ = None - vk1_ao *= 2.0 - vk1_buf *= 2.0 - - vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) - vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) + vj1_buf, vk1_buf, vj1_ao, vk1_ao = fn(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, + omega=omega, with_j=with_j, with_k=with_k) + rhoj0 = rhok0_Pl_ = dm0_tag = None + if with_j: + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + if vj1_int3c is None: + vj1_int3c = -vj1_ao + else: + vj1_int3c -= vj1_ao + vj1_ao = None + # NOTE: 
vj1_int3c and vk1_int3c are in [natm,3,nao,nocc] + # axis=2 in AO, axis=3 in MO + # convert axis=2 into MO now + vj1_int3c = contract('nxiq,ip->nxpq', vj1_int3c, mo_coeff) - vj1_int3c_ip1 = -contract('nxiq,ip->nxpq', vj1_ao, mo_coeff) - vk1_int3c_ip1 = -contract('nxiq,ip->nxpq', vk1_ao, mo_coeff) - vj1_ao = vk1_ao = None + if with_k: + vk1_buf = intopt.unsort_orbitals(vk1_buf, axis=[1,2]) + if vk1_int3c is None: + vk1_int3c = -vk1_ao + else: + vk1_int3c -= vk1_ao + vk1_ao = None + # * 2.0 due to the contraction with mocc + vk1_buf *= 2.0 + vk1_int3c = 2.0 * contract('nxiq,ip->nxpq', vk1_int3c, mo_coeff) t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) mocc = intopt.unsort_orbitals(mocc, axis=[0]) @@ -591,40 +603,48 @@ def _ao2mo(mat): tmp = contract('xij,jo->xio', mat, mocc) return contract('xik,ip->xpk', tmp, mo_coeff) - vj1_int3c = vj1_int3c_ip1 + vj1_int3c_ip2 - vj1_int3c_ip1 = vj1_int3c_ip2 = None - if with_k: - vk1_int3c = vk1_int3c_ip1 + vk1_int3c_ip2 - vk1_int3c_ip1 = vk1_int3c_ip2 = None - - grad_hcore = rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) cupy.get_default_memory_pool().free_all_blocks() - vk1 = None for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] - vj1_ao = cupy.zeros([3,nao,nao]) - vk1_ao = cupy.zeros([3,nao,nao]) - - vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] - vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + if with_j: + vj1_ao = cupy.zeros([3,nao,nao]) + vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] + vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + vj1_int3c[ia] += _ao2mo(vj1_ao) if with_k: + vk1_ao = cupy.zeros([3,nao,nao]) vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:] vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) + vk1_int3c[ia] += _ao2mo(vk1_ao) + return vj1_int3c, vk1_int3c + +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mocc, + hermi=1, with_j=True, with_k=True, omega=None): + mf = hessobj.base + dfobj = mf.with_df + if omega is None: + return jk.get_jk(dfobj, dms, mo_coeff, mocc, 
+ hermi=hermi, with_j=with_j, with_k=with_k) + + # A temporary treatment for RSH-DF integrals + key = '%.6f' % omega + if key in dfobj._rsh_df: + rsh_df = dfobj._rsh_df[key] + else: + rsh_df = dfobj._rsh_df[key] = dfobj.copy().reset() + logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega) + + with rsh_df.mol.with_range_coulomb(omega): + return jk.get_jk(rsh_df, dms, mo_coeff, mocc, + hermi=hermi, with_j=with_j, with_k=with_k, omega=omega) - h1 = grad_hcore[i0] - vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) - if with_k: - vk1 = vk1_int3c[ia] + _ao2mo(vk1_ao) - yield ia, h1, vj1, vk1 class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = rhf_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel + get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 3643d8ad..e0d5cd90 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -23,9 +23,11 @@ import numpy import cupy from pyscf import lib +from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess +from gpu4pyscf.df.hessian.rhf import _get_jk_ip, _partial_hess_ejk from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract @@ -49,17 +51,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) - de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - with_k=with_k) + de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=True, with_k=with_k) de2 += ej # 
(A,B,dR_A,dR_B) if with_k: de2 -= hyb * ek if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - ek_lr = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - True, omega=omega)[2] + ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=False, with_k=True, omega=omega)[2] de2 -= (alpha - hyb) * ek_lr max_memory = None @@ -84,33 +86,38 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) mf = hessobj.base ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - h1mo = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=True, with_k=with_k) + h1mo = vj1 + if with_k: + h1mo -= .5 * hyb * vk1 + vj1 = vk1 = None - for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k): - h1mo[ia] += h1 + vj1 - if with_k: - h1mo[ia] -= .5 * hyb * vk1 if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega): - h1mo[ia] -= .5 * (alpha - hyb) * vk1_lr + _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, + verbose, with_j=False, with_k=True, omega=omega) + h1mo -= .5 * (alpha - hyb) * vk1_lr + vk1_lr = None + + h1mo += rhf_grad.get_grad_hcore(hessobj.base.nuc_grad_method()) + h1mo += rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) return h1mo class Hessian(rks_hess.Hessian): '''Non-relativistic RKS hessian''' from 
gpu4pyscf.lib.utils import to_gpu, device - __init__ = rks_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel + get_jk_mo = df_rhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py new file mode 100644 index 00000000..a3e13260 --- /dev/null +++ b/gpu4pyscf/df/hessian/tests/test_df_rhf_hessian.py @@ -0,0 +1,145 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest +import numpy +import cupy +from pyscf import gto, scf +from pyscf.df.hessian import rhf as df_rhf_cpu +from pyscf.hessian import rhf as rhf_cpu +from gpu4pyscf.df.hessian import rhf as df_rhf_gpu +from gpu4pyscf.hessian import rhf as rhf_gpu + +def setUpModule(): + global mol + mol = gto.Mole() + mol.verbose = 1 + mol.output = '/dev/null' + mol.atom.extend([ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. 
, 0.757 , 0.587)] ]) + mol.basis = 'sto3g' + mol.build() + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + def test_gen_vind(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + nao, nmo = mo_coeff.shape + mocc = mo_coeff[:,mo_occ>0] + nocc = mocc.shape[1] + + fx_cpu = rhf_cpu.gen_vind(mf, mo_coeff, mo_occ) + mo1 = numpy.random.rand(100, nmo*nocc) + v1vo_cpu = fx_cpu(mo1).reshape(-1,nmo*nocc) + + mf = mf.to_gpu() + hessobj = mf.Hessian() + fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ) + mo1 = cupy.asarray(mo1) + v1vo_gpu = fx_gpu(mo1) + assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8 + + def test_partial_hess_elec(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + e1_cpu, ej_cpu, ek_cpu = df_rhf_cpu._partial_hess_ejk(hobj) + + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + e1_gpu, ej_gpu, ek_gpu = df_rhf_gpu._partial_hess_ejk(hobj) + assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-5 + assert numpy.linalg.norm(ej_cpu - ej_gpu.get()) < 1e-5 + assert numpy.linalg.norm(ek_cpu - ek_gpu.get()) < 1e-5 + + def test_make_h1(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + mo_energy = mf.mo_energy + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + mocc = mo_coeff[:,mo_occ>0] + hobj = mf.Hessian() + hobj.auxbasis_response = 1 + h1_cpu = df_rhf_cpu.make_h1(hobj, mo_coeff, mo_occ) + mo1_cpu, mo_e1_cpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_cpu, verbose=1) + h1_cpu = numpy.asarray(h1_cpu) + h1_cpu = numpy.einsum('xypq,pi,qj->xyij', h1_cpu, mo_coeff, mocc) + + mf = mf.to_gpu() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + hobj = mf.Hessian() + hobj.auxbasis_response = 1 + mo_occ = cupy.asarray(mo_occ) + h1_gpu = 
df_rhf_gpu.make_h1(hobj, mo_coeff, mo_occ) + h1_gpu = cupy.asarray(h1_gpu) + mo_energy = cupy.asarray(mo_energy) + mo_coeff = cupy.asarray(mo_coeff) + fx = hobj.gen_vind(mo_coeff, mo_occ) + mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1_gpu, fx, verbose=1) + assert numpy.linalg.norm(h1_cpu - h1_gpu.get()) < 1e-5 + assert numpy.linalg.norm((mo_e1_cpu - mo_e1_gpu)) < 1e-4 + + def test_df_rhf_hess_elec(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.hess_elec() + + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.hess_elec() + assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5 + + def test_df_rhf_hessian(self): + mf = scf.RHF(mol).density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.kernel() + mf = mf.to_gpu() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + +if __name__ == "__main__": + print("Full Tests for DF RHF Hessian") + unittest.main() diff --git a/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py new file mode 100644 index 00000000..f737e92a --- /dev/null +++ b/gpu4pyscf/df/hessian/tests/test_df_rks_hessian.py @@ -0,0 +1,107 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest +import numpy +from pyscf import gto, dft + +def setUpModule(): + global mol + mol = gto.Mole() + mol.verbose = 1 + mol.output = '/dev/null' + mol.atom.extend([ + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ]) + mol.basis = 'sto3g' + mol.build() + +def tearDownModule(): + global mol + mol.stdout.close() + del mol + +class KnownValues(unittest.TestCase): + + def test_df_rks_hess_elec(self): + mf = dft.RKS(mol, xc='b3lyp').density_fit() + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-8 + mf.grids.level = 1 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_cpu = hobj.partial_hess_elec() + + mf = mf.to_gpu() + mf.grids.level = 1 + mf.kernel() + hobj = mf.Hessian() + hobj.auxbasis_response = 2 + hess_gpu = hobj.partial_hess_elec() + assert numpy.linalg.norm(hess_cpu - hess_gpu.get()) < 1e-5 + + def test_df_lda(self): + mf = dft.RKS(mol).density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + + def test_df_gga(self): + mf = dft.RKS(mol, xc='b3lyp').density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hessobj.base.cphf_grids = hessobj.base.grids + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + + def test_df_mgga(self): + mf = dft.RKS(mol, xc='tpss').density_fit() + mf.conv_tol = 1e-10 + mf.grids.level = 1 + mf.conv_tol_cpscf = 1e-8 + mf.kernel() + + hessobj = mf.Hessian() + hess_cpu = hessobj.kernel() + + mf = mf.to_gpu() + hessobj = mf.Hessian() + hessobj.base.cphf_grids = 
hessobj.base.grids + hess_gpu = hessobj.kernel() + assert numpy.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + +if __name__ == "__main__": + print("Full Tests for DF RKS Hessian") + unittest.main() + \ No newline at end of file diff --git a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py index 5a4bbb74..f3094095 100644 --- a/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py +++ b/gpu4pyscf/df/hessian/tests/test_df_uhf_hessian.py @@ -61,7 +61,8 @@ def test_gen_vind(self): v1vo_cpu = fx_cpu(mo1) mf = mf.to_gpu() - fx_gpu = uhf_gpu.gen_vind(mf, mo_coeff, mo_occ) + hessobj = mf.Hessian() + fx_gpu = hessobj.gen_vind(mo_coeff, mo_occ) mo1 = cupy.asarray(mo1) v1vo_gpu = fx_gpu(mo1) assert numpy.linalg.norm(v1vo_cpu - v1vo_gpu.get()) < 1e-8 @@ -113,7 +114,8 @@ def test_make_h1(self): mo_energy = cupy.asarray(mo_energy) mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) - mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), verbose=1) + fx = hobj.gen_vind(mo_coeff, mo_occ) + mo1_gpu, mo_e1_gpu = hobj.solve_mo1(mo_energy, mo_coeff, mo_occ, (h1a_gpu, h1b_gpu), fx, verbose=1) assert numpy.linalg.norm(h1a_cpu - h1a_gpu.get()) < 1e-5 assert numpy.linalg.norm(h1b_cpu - h1b_gpu.get()) < 1e-5 mo1_cpu = (numpy.asarray(mo1_cpu[0]), numpy.asarray(mo1_cpu[1])) diff --git a/gpu4pyscf/df/hessian/uhf.py b/gpu4pyscf/df/hessian/uhf.py index 29d016a4..5e94a248 100644 --- a/gpu4pyscf/df/hessian/uhf.py +++ b/gpu4pyscf/df/hessian/uhf.py @@ -34,26 +34,29 @@ from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.lib.cupy_helper import ( - contract, tag_array, get_avail_mem, release_gpu_stack, pinv) + contract, tag_array, get_avail_mem, release_gpu_stack, pinv, copy_array) from gpu4pyscf.df import int3c2e, df +from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger from gpu4pyscf import __config__ from gpu4pyscf.df.grad.rhf 
import _gen_metric_solver -from gpu4pyscf.gto.mole import sort_atoms +from gpu4pyscf.df.hessian import jk LINEAR_DEP_THR = df.LINEAR_DEP_THR -BLKSIZE = 256 +BLKSIZE = 128 ALIGNED = getattr(__config__, 'ao_aligned', 32) GB = 1024*1024*1024 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): e1, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, True) + atmlst, max_memory, verbose, + with_j=True, with_k=True) return e1 + ej - ek def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, - atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None): + atmlst=None, max_memory=4000, verbose=None, + with_j=True, with_k=True, omega=None): '''Partial derivative ''' log = logger.new_logger(hessobj, verbose) @@ -93,7 +96,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, verbose=0, + group_size=BLKSIZE, group_size_aux=BLKSIZE) mocca = intopt.sort_orbitals(mocca, axis=[0]) moccb = intopt.sort_orbitals(moccb, axis=[0]) @@ -112,43 +116,43 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - hj_ao_ao = cupy.zeros([nao,nao,3,3]) - hk_ao_ao = cupy.zeros([nao,nao,3,3]) - if hessobj.auxbasis_response: - hj_ao_aux = cupy.zeros([nao,naux,3,3]) - hk_ao_aux = cupy.zeros([nao,naux,3,3]) - # int3c contributions wja, wka_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0a_tag, omega=omega) wjb, wkb_P__ = int3c2e.get_int3c2e_jk(mol, auxmol, dm0b_tag, omega=omega) - 
rhoj0_P = solve_j2c(wja + wjb) - rhok0a_P__ = solve_j2c(wka_P__) - rhok0b_P__ = solve_j2c(wkb_P__) + rhoj0_P = rhok0a_P__ = rhok0b_P__ = None + if with_j: + rhoj0_P = solve_j2c(wja + wjb) + if with_k: + rhok0a_P__ = solve_j2c(wka_P__) + rhok0b_P__ = solve_j2c(wkb_P__) wja = wjb = wka_P__ = wkb_P__ = None t1 = log.timer_debug1('intermediate variables with int3c2e', *t1) # int3c_ip2 contributions wja_ip2, wka_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0a_tag, omega=omega) wjb_ip2, wkb_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0b_tag, omega=omega) - wj_ip2 = wja_ip2 + wjb_ip2 + wj_ip2 = None + if with_j: + wj_ip2 = wja_ip2 + wjb_ip2 t1 = log.timer_debug1('interdeidate variables with int3c2e_ip2', *t1) # int3c_ip1 contributions wj1a_P, wk1a_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0a_tag, omega=omega) wj1b_P, wk1b_Pko = int3c2e.get_int3c2e_ip1_wjk(intopt, dm0b_tag, omega=omega) - wj1_P = wj1a_P + wj1b_P - rhoj1_P = solve_j2c(wj1_P) - - hj_ao_ao += 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) wj1_P = None - if hessobj.auxbasis_response: - wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) - wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) - hj_ao_aux += contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) - hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) - hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) - wj1_01 = None - rhoj1_P = None + if with_j: + wj1_P = wj1a_P + wj1b_P + rhoj1_P = solve_j2c(wj1_P) + hj_ao_ao = 4.0*contract('pix,pjy->ijxy', rhoj1_P, wj1_P) # (10|0)(0|0)(0|01) + wj1_P = None + if hessobj.auxbasis_response: + wj0_01 = contract('ypq,q->yp', int2c_ip1, rhoj0_P) + wj1_01 = contract('yqp,pix->iqxy', int2c_ip1, rhoj1_P) + hj_ao_aux = contract('pix,py->ipxy', rhoj1_P, wj_ip2) # (10|0)(1|00) + hj_ao_aux -= contract('pix,yp->ipxy', rhoj1_P, wj0_01) # (10|0)(1|0)(0|00) + hj_ao_aux -= contract('q,iqxy->iqxy', rhoj0_P, wj1_01) # (10|0)(0|1)(0|00) + wj1_01 = None + 
rhoj1_P = None if with_k: mem_avail = get_avail_mem() @@ -159,17 +163,23 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}') if blksize < ALIGNED: raise RuntimeError('Not enough memory for intermediate variables') - + hk_ao_ao = cupy.zeros([nao,nao,3,3]) + if hessobj.auxbasis_response: + hk_ao_aux = cupy.zeros([nao,naux,3,3]) for i0, i1 in lib.prange(0,nao,blksize): - wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) - wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) + #wk1a_Pko_islice = cupy.asarray(wk1a_Pko[:,i0:i1]) + #wk1b_Pko_islice = cupy.asarray(wk1b_Pko[:,i0:i1]) + wk1a_Pko_islice = copy_array(wk1a_Pko[:,i0:i1]) + wk1b_Pko_islice = copy_array(wk1b_Pko[:,i0:i1]) rhok1a_Pko = solve_j2c(wk1a_Pko_islice) rhok1b_Pko = solve_j2c(wk1b_Pko_islice) wk1a_Pko_islice = wk1b_Pko_islice = None for k0, k1 in lib.prange(0,nao,blksize): - wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1]) - wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1]) - + #wk1a_Pko_kslice = cupy.asarray(wk1a_Pko[:,k0:k1]) + #wk1b_Pko_kslice = cupy.asarray(wk1b_Pko[:,k0:k1]) + wk1a_Pko_kslice = copy_array(wk1a_Pko[:,k0:k1]) + wk1b_Pko_kslice = copy_array(wk1b_Pko[:,k0:k1]) + # (10|0)(0|10) without response of RI basis vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1a_Pko, wk1a_Pko_kslice) hk_ao_ao[i0:i1,k0:k1] += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0a[i0:i1,k0:k1]) @@ -214,49 +224,30 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) cupy.get_default_memory_pool().free_all_blocks() - # int3c_ipip1 contributions - fn = int3c2e.get_int3c2e_hjk - hja_ao_diag, hka_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb_ao_diag, hkb_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_diag = 2.0 * (hja_ao_diag + hjb_ao_diag) 
- if with_k: - hk_ao_diag = 2.0 * (hka_ao_diag + hkb_ao_diag) - t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1) - - # int3c_ipvip1 contributions - # (11|0), (0|00) without response of RI basis - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ipvip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ipvip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_ao += 2.0*(hja + hjb) + hja_ipip, hka_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, + with_j=with_j, with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + hjb_ipip, hkb_ipip = jk.get_int3c2e_hjk(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, + with_j=with_j, with_k=with_k, omega=omega, + auxbasis_response=hessobj.auxbasis_response) + if with_j: + hj_ipip = hja_ipip + hjb_ipip if with_k: - hk_ao_ao += (hka + hkb) - hja = hjb = hka = hkb = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) - - # int3c_ip1ip2 contributions - # (10|1), (0|0)(0|00) - if hessobj.auxbasis_response: - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ip1ip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ip1ip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_ao_aux += hja + hjb - if with_k: - hk_ao_aux += hka + hkb - hja = hjb = hka = hkb = None - t1 = log.timer_debug1('intermediate variables with int3c2e_ip1ip2', *t1) + hk_ipip = 2.0*(hka_ipip + hkb_ipip) + t1 = log.timer_debug1('intermediate variables with int3c2e_ipip', *t1) - # int3c_ipip2 contributions if hessobj.auxbasis_response > 1: - # (00|2), (0|0)(0|00) - fn = int3c2e.get_int3c2e_hjk - hja, hka = fn(intopt, 'ipip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k) - hjb, hkb = fn(intopt, 'ipip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k) - hj_aux_diag = hja + hjb if with_k: - hk_aux_diag = (hka + hkb) - hja = hjb = hka = hkb = None - t1 = 
log.timer_debug1('intermediate variables with int3c2e_ipip2', *t1) + rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) + rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) + + rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__) + rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__) + rhok0a_P__ = rhok0b_P__ = None + + rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__) + rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__) + wka_ip2_P__ = wkb_ip2_P__ = None # int2c contributions if hessobj.auxbasis_response > 1: @@ -267,14 +258,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1') int2c_ipip1 = cupy.asarray(int2c_ipip1, order='C') int2c_ipip1 = intopt.sort_orbitals(int2c_ipip1, aux_axis=[1,2]) - rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) # (00|0)(2|0)(0|00) - # p,xp->px - hj_aux_diag -= (rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) + if with_j: + rhoj2c_P = contract('xpq,q->xp', int2c_ipip1, rhoj0_P) + # p,xp->px + hj_aux_diag = -(rhoj0_P*rhoj2c_P).T.reshape(-1,3,3) if with_k: - rho2c_0 = contract('pij,qji->pq', rhok0a_P__, rhok0a_P__) - rho2c_0+= contract('pij,qji->pq', rhok0b_P__, rhok0b_P__) - hk_aux_diag -= contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) + hk_aux_diag = -contract('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None if omega and omega > 1e-10: @@ -284,7 +274,8 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') int2c_ip1ip2 = cupy.asarray(int2c_ip1ip2, order='C') int2c_ip1ip2 = intopt.sort_orbitals(int2c_ip1ip2, aux_axis=[1,2]) - hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) + if with_j: + hj_aux_aux = -.5 * contract('p,xpq->pqx', rhoj0_P, int2c_ip1ip2*rhoj0_P).reshape(naux, naux,3,3) if with_k: hk_aux_aux = -.5 * contract('xpq,pq->pqx', 
int2c_ip1ip2, rho2c_0).reshape(naux,naux,3,3) t1 = log.timer_debug1('intermediate variables with int2c_*', *t1) @@ -294,33 +285,25 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, release_gpu_stack() # aux-aux pair if hessobj.auxbasis_response > 1: - wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) int2c_ip1_inv = contract('yqp,pr->yqr', int2c_ip1, int2c_inv) - - rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) - hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) - rhoj0_10 = rhoj0_P = None - - rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) - hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) - hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) - hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) - wj0_10 = rhoj1 = wj_ip2 = None - - rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) - hj_aux_aux += .5 * contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) - wj0_01 = rhoj0_01 = None + if with_j: + wj0_10 = contract('ypq,p->ypq', int2c_ip1, rhoj0_P) + rhoj0_10 = contract('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) + hj_aux_aux += .5 * contract('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) + hj_aux_aux += contract('xpq,yq->pqxy', rhoj0_10, wj0_01) # (00|0)(1|0), (1|0)(0|00) + rhoj0_10 = rhoj0_P = None + + rhoj1 = contract('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) + hj_aux_aux -= contract('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) + hj_aux_aux += .5 * contract('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) + hj_aux_aux -= contract('xpr,yqr->pqxy', rhoj1, wj0_10) # (00|1), (0|1)(0|00) + wj0_10 = rhoj1 = wj_ip2 = None + + rhoj0_01 = contract('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) + hj_aux_aux += .5 * 
contract('xpq,yq->pqxy', rhoj0_01, wj0_01) # (00|0)(0|1), (1|0)(0|00) + wj0_01 = rhoj0_01 = None if with_k: - rho2c_10 = contract('rijx,qij->rqx', wka_ip2_P__, rhok0a_P__) - rho2c_10+= contract('rijx,qij->rqx', wkb_ip2_P__, rhok0b_P__) - rhok0a_P__ = rhok0b_P__ = None - - - rho2c_11 = contract('pijx,qijy->pqxy', wka_ip2_P__, wka_ip2_P__) - rho2c_11+= contract('pijx,qijy->pqxy', wkb_ip2_P__, wkb_ip2_P__) - wka_ip2_P__ = wkb_ip2_P__ = None hk_aux_aux += .5 * contract('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_11 = None @@ -342,16 +325,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_aux_aux -= contract('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv) # (00|1)(0|1)(0|00) rho2c_10= int2c_ip1_inv = None t1 = log.timer_debug1('contract int2c_*', *t1) - - hj_ao_diag = intopt.unsort_orbitals(hj_ao_diag, axis=[0]) - hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) - if hessobj.auxbasis_response: - hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) - if hessobj.auxbasis_response > 1: - hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) - hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) + if with_j: + hj_ao_ao = intopt.unsort_orbitals(hj_ao_ao, axis=[0,1]) + if hessobj.auxbasis_response: + hj_ao_aux = intopt.unsort_orbitals(hj_ao_aux, axis=[0], aux_axis=[1]) + if hessobj.auxbasis_response > 1: + hj_aux_diag = intopt.unsort_orbitals(hj_aux_diag, aux_axis=[0]) + hj_aux_aux = intopt.unsort_orbitals(hj_aux_aux, aux_axis=[0,1]) if with_k: - hk_ao_diag = intopt.unsort_orbitals(hk_ao_diag, axis=[0]) hk_ao_ao = intopt.unsort_orbitals(hk_ao_ao, axis=[0,1]) if hessobj.auxbasis_response: hk_ao_aux = intopt.unsort_orbitals(hk_ao_aux, axis=[0], aux_axis=[1]) @@ -380,19 +361,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # ----------------------------------------- # collecting all # ----------------------------------------- - hk_ao_ao *= 2.0 e1 = 
cupy.zeros([len(atmlst),len(atmlst),3,3]) - ej = cupy.zeros([len(atmlst),len(atmlst),3,3]) - ek = cupy.zeros([len(atmlst),len(atmlst),3,3]) + ej = ek = None + if with_j: + ej = hj_ipip + if with_k: + hk_ao_ao *= 2.0 + ek = hk_ipip for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] e1[i0,i0] -= cupy.sum(h1aa[p0:p1], axis=0) - ej[i0,i0] += cupy.sum(hj_ao_diag[p0:p1,:,:], axis=0) - if with_k: - ek[i0,i0] += cupy.sum(hk_ao_diag[p0:p1,:,:], axis=0) for j0, ja in enumerate(atmlst[:i0+1]): q0, q1 = aoslices[ja][2:] - ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) + if with_j: + ej[i0,j0] += cupy.sum(hj_ao_ao[p0:p1,q0:q1], axis=[0,1]) e1[i0,j0] -= cupy.sum(h1ab[p0:p1,q0:q1], axis=[0,1]) if with_k: ek[i0,j0] += cupy.sum(hk_ao_ao[p0:p1,q0:q1], axis=[0,1]) @@ -403,13 +385,14 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response: for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) - if hessobj.auxbasis_response > 1: - ej[i0,j0] += _ej * 2 - ej[j0,i0] += _ej.T * 2 - else: - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_ao_aux[p0:p1,q0:q1], axis=[0,1]) + if hessobj.auxbasis_response > 1: + ej[i0,j0] += _ej * 2 + ej[j0,i0] += _ej.T * 2 + else: + ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_ao_aux[p0:p1,q0:q1], axis=[0,1]) if hessobj.auxbasis_response > 1: @@ -423,13 +406,15 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, # if hessobj.auxbasis_response > 1: shl0, shl1, p0, p1 = auxslices[ia] - ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) + if with_j: + ej[i0,i0] += cupy.sum(hj_aux_diag[p0:p1], axis=0) if with_k: ek[i0,i0] += cupy.sum(hk_aux_diag[p0:p1], axis=0) for j0, (q0, q1) in enumerate(auxslices[:,2:]): - _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) - ej[i0,j0] += _ej - ej[j0,i0] += _ej.T + if with_j: + _ej = cupy.sum(hj_aux_aux[p0:p1,q0:q1], axis=[0,1]) + 
ej[i0,j0] += _ej + ej[j0,i0] += _ej.T if with_k: _ek = cupy.sum(hk_aux_aux[p0:p1,q0:q1], axis=[0,1]) ek[i0,j0] += _ek @@ -437,8 +422,10 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, for i0, ia in enumerate(atmlst): for j0 in range(i0): e1[j0,i0] = e1[i0,j0].T - ej[j0,i0] = ej[i0,j0].T - ek[j0,i0] = ek[i0,j0].T + if with_j: + ej[j0,i0] = ej[i0,j0].T + if with_k: + ek[j0,i0] = ek[i0,j0].T t1 = log.timer_debug1('hcore contribution', *t1) log.timer('UHF partial hessian', *time0) return e1, ej, ek @@ -447,25 +434,28 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol natm = mol.natm + mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) if atmlst is None: atmlst = range(natm) - nocca, noccb = hessobj.base.nelec - nmo = len(mo_occ[0]) - h1aoa = cupy.empty((natm, 3, nmo, nocca)) - h1aob = cupy.empty((natm, 3, nmo, noccb)) - for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True): - h1a, h1b = h1 - vj1a, vj1b = vj1 - vk1a, vk1b = vk1 - - h1aoa[ia] = h1a + vj1a - vk1a - h1aob[ia] = h1b + vj1b - vk1b - return (h1aoa, h1aob) - -def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, - verbose=None, with_k=True, omega=None): + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, True) + vj1a, vj1b = vj1 + vk1a, vk1b = vk1 + h1moa = vj1a + h1moa-= vk1a + h1mob = vj1b + h1mob-= vk1b + vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None + + gobj = hessobj.base.nuc_grad_method() + h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) + h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) + return (h1moa, h1mob) + +def _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, + verbose=None, with_j=True, with_k=True, omega=None): ''' A generator to produce the derivatives of Hcore, J, K matrices in MO bases ''' 
@@ -474,8 +464,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, mol = hessobj.mol if atmlst is None: atmlst = range(mol.natm) - # FIXME - with_k = True + mo_coeff = cupy.asarray(mo_coeff, order='C') mo_occ = cupy.asarray(mo_occ, order='C') @@ -500,12 +489,12 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c = cupy.asarray(int2c, order='C') # ======================= sorted AO begin ====================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') - intopt.build(mf.direct_scf_tol, - diag_block_with_triu=True, - aosym=False, - group_size_aux=BLKSIZE, + intopt.build(mf.direct_scf_tol, + diag_block_with_triu=True, + aosym=False, verbose=0, + group_size_aux=BLKSIZE, group_size=BLKSIZE) - + mocca = intopt.sort_orbitals(mocca, axis=[0]) moccb = intopt.sort_orbitals(moccb, axis=[0]) mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1]) @@ -519,10 +508,12 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, fn = int3c2e.get_int3c2e_wjk dm0_tag = tag_array(dm0, occ_coeff=mocca) - wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) + wj, wka_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) - wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, omega=omega) - rhoj0 = solve_j2c(wj) + wj, wkb_Pl_ = fn(mol, auxmol, dm0_tag, with_j=with_j, with_k=with_k, omega=omega) + rhoj0 = None + if with_j: + rhoj0 = solve_j2c(wj) wj = None if isinstance(wka_Pl_, cupy.ndarray): @@ -530,8 +521,11 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0a_Pl_ = np.empty_like(wka_Pl_) for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1]) - rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + # wk_tmp = cupy.asarray(wka_Pl_[:,p0:p1]) + # rhok0a_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wka_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0a_Pl_[:,p0:p1]) wk_tmp = None if 
isinstance(wkb_Pl_, cupy.ndarray): @@ -539,31 +533,14 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, else: rhok0b_Pl_ = np.empty_like(wkb_Pl_) for p0, p1 in lib.prange(0,nao,64): - wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1]) - rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + #wk_tmp = cupy.asarray(wkb_Pl_[:,p0:p1]) + #rhok0b_Pl_[:,p0:p1] = solve_j2c(wk_tmp).get() + wk_tmp = copy_array(wkb_Pl_[:,p0:p1]) + wk_tmp = solve_j2c(wk_tmp) + copy_array(wk_tmp, rhok0b_Pl_[:,p0:p1]) wk_tmp = None wka_Pl_ = wkb_Pl_ = None - - # ----------------------------- - # int3c_ip1 contributions - # ------------------------------ - cupy.get_default_memory_pool().free_all_blocks() - fn = int3c2e.get_int3c2e_ip1_vjk - dm0_tag = tag_array(dm0, occ_coeff=mocca) - vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, omega=omega) - dm0_tag = tag_array(dm0, occ_coeff=moccb) - vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, omega=omega) - - vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) - vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) - vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) - - vj1a_int3c = -contract('nxiq,ip->nxpq', vj1a_ao, mo_coeff[0]) - vj1b_int3c = -contract('nxiq,ip->nxpq', vj1b_ao, mo_coeff[1]) - vk1a_int3c = -contract('nxiq,ip->nxpq', vk1a_ao, mo_coeff[0]) - vk1b_int3c = -contract('nxiq,ip->nxpq', vk1b_ao, mo_coeff[1]) - vj1a_ao = vj1b_ao = vk1a_ao = vk1b_ao = None - t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) + vj1a_int3c = vj1b_int3c = vk1a_int3c = vk1b_int3c = None # -------------------------- # int3c_ip2 contribution @@ -572,9 +549,11 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if hessobj.auxbasis_response: fn = int3c2e.get_int3c2e_ip2_vjk dm0_tag = tag_array(dm0, occ_coeff=mocca) - vj1a_int3c_ip2, vk1a_int3c_ip2 = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, auxslices, omega=omega) + vj1a_int3c, vk1a_int3c = fn(intopt, rhoj0, 
rhok0a_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) dm0_tag = tag_array(dm0, occ_coeff=moccb) - vj1b_int3c_ip2, vk1b_int3c_ip2 = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, omega=omega) + vj1b_int3c, vk1b_int3c = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, auxslices, + with_j=with_j, with_k=with_k, omega=omega) # Responses due to int2c2e_ip1 if omega and omega > 1e-10: @@ -584,34 +563,37 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') int2c_ip1 = cupy.asarray(int2c_ip1, order='C') int2c_ip1 = intopt.sort_orbitals(int2c_ip1, aux_axis=[1,2]) - - # generate rhok0_P__ - if isinstance(rhok0a_Pl_, cupy.ndarray): - rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) - else: - naux = auxmol.nao - nocc = mocca.shape[1] - rhok0a_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) - rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca) - rhok0_Pl_tmp = None - - # generate rhok0_P__ - if isinstance(rhok0b_Pl_, cupy.ndarray): - rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) - else: - naux = auxmol.nao - nocc = moccb.shape[1] - rhok0b_P__ = cupy.empty([naux,nocc,nocc]) - for p0, p1 in lib.prange(0,naux,64): - rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) - rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb) - rhok0_Pl_tmp = None - - wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) - wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__) - wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__) + if with_k: + # generate rhok0_P__ + if isinstance(rhok0a_Pl_, cupy.ndarray): + rhok0a_P__ = contract('pio,ir->pro', rhok0a_Pl_, mocca) + else: + naux = auxmol.nao + nocc = mocca.shape[1] + rhok0a_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + #rhok0_Pl_tmp = cupy.asarray(rhok0a_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0a_Pl_[p0:p1]) + 
rhok0a_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, mocca) + rhok0_Pl_tmp = None + + # generate rhok0_P__ + if isinstance(rhok0b_Pl_, cupy.ndarray): + rhok0b_P__ = contract('pio,ir->pro', rhok0b_Pl_, moccb) + else: + naux = auxmol.nao + nocc = moccb.shape[1] + rhok0b_P__ = cupy.empty([naux,nocc,nocc]) + for p0, p1 in lib.prange(0,naux,64): + #rhok0_Pl_tmp = cupy.asarray(rhok0b_Pl_[p0:p1]) + rhok0_Pl_tmp = copy_array(rhok0b_Pl_[p0:p1]) + rhok0b_P__[p0:p1] = contract('pio,ir->pro', rhok0_Pl_tmp, moccb) + rhok0_Pl_tmp = None + if with_j: + wj0_10 = contract('xpq,q->xp', int2c_ip1, rhoj0) + if with_k: + wk0a_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0a_P__) + wk0b_10_P__ = contract('xqp,pro->xqro', int2c_ip1, rhok0b_P__) aux2atom = int3c2e.get_aux2atom(intopt, auxslices) mem_avail = get_avail_mem() @@ -620,42 +602,76 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}') if blksize < ALIGNED: raise RuntimeError('Not enough memory to compute int3c2e_ip2') - - for p0, p1 in lib.prange(0,nao,64): - rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1]) - rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1]) - vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10) - vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10) + for p0, p1 in lib.prange(0,nao,blksize): + #rhoka_tmp = cupy.asarray(rhok0a_Pl_[:,p0:p1]) + #rhokb_tmp = cupy.asarray(rhok0b_Pl_[:,p0:p1]) + rhoka_tmp = copy_array(rhok0a_Pl_[:,p0:p1]) + rhokb_tmp = copy_array(rhok0b_Pl_[:,p0:p1]) wk0a_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhoka_tmp) wk0b_10_Pl_ = contract('xqp,pio->xqio', int2c_ip1, rhokb_tmp) - vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0) - vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0) - vj1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom) - vj1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom) - vj1a_tmp = vj1b_tmp = None + if with_j: + 
vj1a_tmp = contract('pio,xp->xpio', rhoka_tmp, wj0_10) + vj1b_tmp = contract('pio,xp->xpio', rhokb_tmp, wj0_10) + + vj1a_tmp += contract('xpio,p->xpio', wk0a_10_Pl_, rhoj0) + vj1b_tmp += contract('xpio,p->xpio', wk0b_10_Pl_, rhoj0) + vj1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1a_tmp, aux2atom) + vj1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vj1b_tmp, aux2atom) + vj1a_tmp = vj1b_tmp = None if with_k: vk1a_tmp = contract('xpio,pro->xpir', wk0a_10_Pl_, rhok0a_P__) vk1a_tmp += contract('xpro,pir->xpio', wk0a_10_P__, rhoka_tmp) vk1b_tmp = contract('xpio,pro->xpir', wk0b_10_Pl_, rhok0b_P__) vk1b_tmp += contract('xpro,pir->xpio', wk0b_10_P__, rhokb_tmp) - vk1a_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom) - vk1b_int3c_ip2[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom) + vk1a_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1a_tmp, aux2atom) + vk1b_int3c[:,:,p0:p1] += contract('xpio,pa->axio', vk1b_tmp, aux2atom) vk1a_tmp = vk1b_tmp = None wk0a_10_Pl_ = wk0b_10_Pl_ = rhoka_tmp = rhokb_tmp = None wj0_10 = wk0a_10_P__ = wk0b_10_P__ = rhok0a_P__ =rhok0b_P__ = int2c_ip1 = None - rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None - aux2atom = None - vj1a_int3c += contract('nxiq,ip->nxpq', vj1a_int3c_ip2, mo_coeff[0]) - vj1b_int3c += contract('nxiq,ip->nxpq', vj1b_int3c_ip2, mo_coeff[1]) - if with_k: - vk1a_int3c += contract('nxiq,ip->nxpq', vk1a_int3c_ip2, mo_coeff[0]) - vk1b_int3c += contract('nxiq,ip->nxpq', vk1b_int3c_ip2, mo_coeff[1]) - vk1a_int3c_ip2 = vk1b_int3c_ip2 = None + aux2atom = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip2', *t0) + # ----------------------------- + # int3c_ip1 contributions + # ------------------------------ + cupy.get_default_memory_pool().free_all_blocks() + fn = int3c2e.get_int3c2e_ip1_vjk + dm0_tag = tag_array(dm0, occ_coeff=mocca) + vj1_buf, vk1a_buf, vj1a_ao, vk1a_ao = fn(intopt, rhoj0, rhok0a_Pl_, dm0_tag, aoslices, + with_j=with_j, with_k=with_k, omega=omega) + dm0_tag = tag_array(dm0, 
occ_coeff=moccb) + vj1_buf, vk1b_buf, vj1b_ao, vk1b_ao = fn(intopt, rhoj0, rhok0b_Pl_, dm0_tag, aoslices, + with_j=with_j, with_k=with_k, omega=omega) + rhoj0 = rhok0a_Pl_ = rhok0b_Pl_ = None + + if with_j: + vj1_buf = intopt.unsort_orbitals(vj1_buf, axis=[1,2]) + if not hessobj.auxbasis_response: + vj1a_int3c = -vj1a_ao + vj1b_int3c = -vj1b_ao + else: + vj1a_int3c -= vj1a_ao + vj1b_int3c -= vj1b_ao + vj1a_ao = vj1b_ao = None + vj1a_int3c = contract('nxiq,ip->nxpq', vj1a_int3c, mo_coeff[0]) + vj1b_int3c = contract('nxiq,ip->nxpq', vj1b_int3c, mo_coeff[1]) + if with_k: + vk1a_buf = intopt.unsort_orbitals(vk1a_buf, axis=[1,2]) + vk1b_buf = intopt.unsort_orbitals(vk1b_buf, axis=[1,2]) + if not hessobj.auxbasis_response: + vk1a_int3c = -vk1a_ao + vk1b_int3c = -vk1b_ao + else: + vk1a_int3c -= vk1a_ao + vk1b_int3c -= vk1b_ao + vk1a_ao = vk1b_ao = None + vk1a_int3c = contract('nxiq,ip->nxpq', vk1a_int3c, mo_coeff[0]) + vk1b_int3c = contract('nxiq,ip->nxpq', vk1b_int3c, mo_coeff[1]) + t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) + mocca = intopt.unsort_orbitals(mocca, axis=[0]) moccb = intopt.unsort_orbitals(moccb, axis=[0]) mo_coeff = intopt.unsort_orbitals(mo_coeff, axis=[1]) @@ -666,43 +682,35 @@ def _ao2mo(mat, mocc, mo): tmp = contract('xij,jo->xio', mat, mocc) return contract('xik,ip->xpk', tmp, mo) - gobj = hessobj.base.nuc_grad_method() - grad_hcore_a = rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) - grad_hcore_b = rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) cupy.get_default_memory_pool().free_all_blocks() - vk1a = vk1b = None for i0, ia in enumerate(atmlst): shl0, shl1, p0, p1 = aoslices[ia] - vj1_ao = cupy.zeros([3,nao,nao]) - vk1a_ao = cupy.zeros([3,nao,nao]) - vk1b_ao = cupy.zeros([3,nao,nao]) - - vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] - vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) + if with_j: + vj1_ao = cupy.zeros([3,nao,nao]) + vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] + vj1_ao[:,:,p0:p1] -= 
vj1_buf[:,p0:p1,:].transpose(0,2,1) + vj1a_int3c[ia] += _ao2mo(vj1_ao, mocca, mo_coeff[0]) + vj1b_int3c[ia] += _ao2mo(vj1_ao, moccb, mo_coeff[1]) if with_k: + vk1a_ao = cupy.zeros([3,nao,nao]) + vk1b_ao = cupy.zeros([3,nao,nao]) vk1a_ao[:,p0:p1,:] -= vk1a_buf[:,p0:p1,:] vk1a_ao[:,:,p0:p1] -= vk1a_buf[:,p0:p1,:].transpose(0,2,1) vk1b_ao[:,p0:p1,:] -= vk1b_buf[:,p0:p1,:] vk1b_ao[:,:,p0:p1] -= vk1b_buf[:,p0:p1,:].transpose(0,2,1) + vk1a_int3c[ia] += _ao2mo(vk1a_ao, mocca, mo_coeff[0]) + vk1b_int3c[ia] += _ao2mo(vk1b_ao, moccb, mo_coeff[1]) + return (vj1a_int3c, vj1b_int3c), (vk1a_int3c, vk1b_int3c) - h1a = grad_hcore_a[i0] - h1b = grad_hcore_b[i0] - vj1a = vj1a_int3c[ia] + _ao2mo(vj1_ao, mocca, mo_coeff[0]) - vj1b = vj1b_int3c[ia] + _ao2mo(vj1_ao, moccb, mo_coeff[1]) - if with_k: - vk1a = vk1a_int3c[ia] + _ao2mo(vk1a_ao, mocca, mo_coeff[0]) - vk1b = vk1b_int3c[ia] + _ao2mo(vk1b_ao, moccb, mo_coeff[1]) - yield ia, (h1a, h1b), (vj1a, vj1b), (vk1a, vk1b) +_get_jk_mo = df_rhf_hess._get_jk_mo class Hessian(uhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = uhf_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - kernel = rhf_hess.kernel - hess = kernel + get_jk_mo = _get_jk_mo diff --git a/gpu4pyscf/df/hessian/uks.py b/gpu4pyscf/df/hessian/uks.py index 1e7ee43b..059f571c 100644 --- a/gpu4pyscf/df/hessian/uks.py +++ b/gpu4pyscf/df/hessian/uks.py @@ -23,10 +23,12 @@ import numpy import cupy from pyscf import lib +from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.hessian import uks as uks_hess from gpu4pyscf.df.hessian import uhf as df_uhf_hess +from gpu4pyscf.df.hessian.uhf import _partial_hess_ejk, _get_jk_ip from gpu4pyscf.lib import logger from gpu4pyscf.lib.cupy_helper import contract @@ -51,17 +53,17 @@ def partial_hess_elec(hessobj, 
mo_energy=None, mo_coeff=None, mo_occ=None, omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = mf._numint.libxc.is_hybrid_xc(mf.xc) - de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - with_k=with_k) + de2, ej, ek = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=True, with_k=with_k) de2 += ej # (A,B,dR_A,dR_B) if with_k: de2 -= hyb * ek if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - ek_lr = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, - atmlst, max_memory, verbose, - True, omega=omega)[2] + ek_lr = _partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ, + atmlst, max_memory, verbose, + with_j=False, with_k=True, omega=omega)[2] de2 -= (alpha - hyb) * ek_lr max_memory = None @@ -89,40 +91,50 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol + natm = mol.natm + assert atmlst is None or atmlst ==range(natm) mf = hessobj.base ni = mf._numint ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True) omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) - h1moa, h1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + with_k = ni.libxc.is_hybrid_xc(mf.xc) - for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, with_k): + vj1, vk1 = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=True, with_k=True) + vj1a, vj1b = vj1 + h1moa = vj1a + h1mob = vj1b + + if with_k: + vk1a, vk1b = vk1 + h1moa -= hyb * vk1a + h1mob -= hyb * vk1b + vj1 = vk1 = vj1a = vj1b = vk1a = vk1b = None - h1moa[ia] += h1[0] + vj1[0] - h1mob[ia] += h1[1] + vj1[1] - if with_k: - vk1a, vk1b = vk1 - h1moa[ia] -= hyb * vk1a - h1mob[ia] -= hyb 
* vk1b if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: - for ia, h1, vj1_lr, vk1_lr in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile, - atmlst, verbose, True, omega=omega): - vk1a, vk1b = vk1_lr - h1moa[ia] -= (alpha - hyb) * vk1a - h1mob[ia] -= (alpha - hyb) * vk1b + _, vk1_lr = _get_jk_ip(hessobj, mo_coeff, mo_occ, chkfile, + atmlst, verbose, with_j=False, with_k=True, omega=omega) + vk1a, vk1b = vk1_lr + h1moa -= (alpha - hyb) * vk1a + h1mob -= (alpha - hyb) * vk1b + + gobj = hessobj.base.nuc_grad_method() + h1moa += rhf_grad.get_grad_hcore(gobj, mo_coeff[0], mo_occ[0]) + h1mob += rhf_grad.get_grad_hcore(gobj, mo_coeff[1], mo_occ[1]) + + v1moa, v1mob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory) + h1moa += v1moa + h1mob += v1mob return h1moa, h1mob class Hessian(uks_hess.Hessian): '''Non-relativistic RKS hessian''' from gpu4pyscf.lib.utils import to_gpu, device - __init__ = uks_hess.Hessian.__init__ auxbasis_response = 1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 - hess_elec = uhf_hess.hess_elec - kernel = rhf_hess.kernel - hess = kernel + get_jk_mo = df_uhf_hess._get_jk_mo diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 3bb6c916..e77e30ca 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -20,8 +20,8 @@ from pyscf import gto, df, lib from pyscf.scf import _vhf from gpu4pyscf.scf.int4c2e import BasisProdCache, libgvhf, libgint -from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, - reduce_to_device) +from gpu4pyscf.lib.cupy_helper import (block_c2s_diag, cart2sph, contract, get_avail_mem, + reduce_to_device, copy_array, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.gto.mole import basis_seg_contraction from gpu4pyscf.__config__ import _num_devices, _streams @@ -29,7 +29,7 @@ LMAX_ON_GPU = 8 FREE_CUPY_CACHE = True STACK_SIZE_PER_THREAD = 8192 * 4 -BLKSIZE = 128 +BLKSIZE = 256 NROOT_ON_GPU = 7 def make_fake_mol(): @@ -103,8 
+103,8 @@ def __del__(self): except AttributeError: pass - def build(self, cutoff=1e-14, group_size=None, - group_size_aux=None, diag_block_with_triu=False, aosym=False): + def build(self, cutoff=1e-14, group_size=None, group_size_aux=None, + diag_block_with_triu=False, aosym=False, verbose=None): ''' int3c2e is based on int2e with (ao,ao|aux,1) a tot_mol is created with concatenating [mol, fake_mol, aux_mol] @@ -116,7 +116,9 @@ def build(self, cutoff=1e-14, group_size=None, mol = basis_seg_contraction(_mol, allow_replica=True)[0] auxmol = basis_seg_contraction(_auxmol, allow_replica=True)[0] - log = logger.new_logger(_mol, _mol.verbose) + if verbose is None: + verbose = _mol.verbose + log = logger.new_logger(_mol, verbose) cput0 = log.init_timer() _sorted_mol, sorted_idx, uniq_l_ctr, l_ctr_counts = sort_mol(mol, log=log) @@ -181,7 +183,7 @@ def build(self, cutoff=1e-14, group_size=None, aux_loc = _auxmol.ao_loc_nr(cart=_auxmol.cart) ao_idx = np.array_split(np.arange(_auxmol.nao), aux_loc[1:-1]) - self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) + self._aux_ao_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) cput1 = log.timer_debug1('Aux AO indices', *cput1) ao_loc = _sorted_mol.ao_loc_nr(cart=_mol.cart) @@ -218,28 +220,10 @@ def build(self, cutoff=1e-14, group_size=None, self.pair2bra = pair2bra self.pair2ket = pair2ket self.l_ctr_offsets = l_ctr_offsets - bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1) - bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) - log_qs = log_qs + aux_log_qs - ao_loc = _tot_mol.ao_loc_nr(cart=True) - ncptype = len(log_qs) self._bpcache = {} - for n in range(_num_devices): - with cupy.cuda.Device(n), _streams[n]: - bpcache = ctypes.POINTER(BasisProdCache)() - scale_shellpair_diag = 1. 
- libgint.GINTinit_basis_prod( - ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag), - ao_loc.ctypes.data_as(ctypes.c_void_p), - bas_pair2shls.ctypes.data_as(ctypes.c_void_p), - bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), - _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), - _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), - _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) - self._bpcache[n] = bpcache - cput1 = log.timer_debug1('Initialize GPU cache', *cput1) + bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) self.bas_pairs_locs = bas_pairs_locs ncptype = len(self.log_qs) self.aosym = aosym @@ -260,10 +244,31 @@ def build(self, cutoff=1e-14, group_size=None, self._sorted_mol = _sorted_mol self._sorted_auxmol = _sorted_auxmol - + @property def bpcache(self): device_id = cupy.cuda.Device().id + if device_id not in self._bpcache: + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(self.mol, self.mol.verbose) + cput0 = log.init_timer() + bpcache = ctypes.POINTER(BasisProdCache)() + scale_shellpair_diag = 1. 
+ _tot_mol = self._tot_mol + log_qs = self.log_qs + self.aux_log_qs + ao_loc = _tot_mol.ao_loc_nr(cart=True) + bas_pair2shls = np.hstack(self.pair2bra + self.pair2ket).astype(np.int32).reshape(2,-1) + ncptype = len(log_qs) + libgint.GINTinit_basis_prod( + ctypes.byref(bpcache), ctypes.c_double(scale_shellpair_diag), + ao_loc.ctypes.data_as(ctypes.c_void_p), + bas_pair2shls.ctypes.data_as(ctypes.c_void_p), + self.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(ncptype), + _tot_mol._atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.natm), + _tot_mol._bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(_tot_mol.nbas), + _tot_mol._env.ctypes.data_as(ctypes.c_void_p)) + self._bpcache[device_id] = bpcache + cput0 = log.timer_debug1(f'Initialize GPU cache on Device {device_id}', *cput0) bpcache = self._bpcache[device_id] return bpcache @@ -310,15 +315,15 @@ def unsort_orbitals(self, sorted_mat, axis=[], aux_axis=[]): mat = cupy.empty_like(sorted_mat) mat[tuple(fancy_index)] = sorted_mat return mat - + @property def cart2sph(self): return block_c2s_diag(self.angular, self.l_ctr_counts) - + @property def aux_cart2sph(self): return block_c2s_diag(self.aux_angular, self.aux_l_ctr_counts) - + @property def coeff(self): nao = self.mol.nao @@ -339,36 +344,45 @@ def aux_coeff(self): self._aux_coeff = self.unsort_orbitals(self.aux_cart2sph, aux_axis=[1]) return self._aux_coeff -def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): +def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_j=True, with_k=True): log = logger.new_logger(mol, mol.verbose) intopt = VHFOpt(mol, auxmol, 'int2e') - intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) + intopt.build(thred, diag_block_with_triu=True, aosym=True, + group_size=BLKSIZE, group_size_aux=BLKSIZE) orbo = dm0_tag.occ_coeff nao = mol.nao naux = auxmol.nao nocc = orbo.shape[1] - wj = cupy.empty([naux]) - avail_mem = 
get_avail_mem() - use_gpu_memory = True - if naux*nao*nocc*8 < 0.4*avail_mem: - try: - wk = cupy.empty([naux,nao,nocc]) - except Exception: + + wj = None + if with_j: + wj = cupy.empty([naux]) + + wk = None + if with_k: + avail_mem = get_avail_mem() + use_gpu_memory = True + if naux*nao*nocc*8 < 0.4*avail_mem: + try: + wk = cupy.empty([naux,nao,nocc]) + except Exception: + use_gpu_memory = False + else: use_gpu_memory = False - else: - use_gpu_memory = False - - if not use_gpu_memory: - log.debug('Saving int3c2e_wjk on CPU memory') - mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8) - wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem) + + if not use_gpu_memory: + log.debug('Saving int3c2e_wjk on CPU memory') + mem = cupy.cuda.alloc_pinned_memory(naux*nao*nocc*8) + wk = np.ndarray([naux,nao,nocc], dtype=np.float64, order='C', buffer=mem) # TODO: async data transfer for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.aux_ao_loc[cp_kl_id] k1 = intopt.aux_ao_loc[cp_kl_id+1] - rhoj_tmp = cupy.zeros([k1-k0], order='C') - rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C') + if with_j: + rhoj_tmp = cupy.zeros([k1-k0], order='C') + if with_k: + rhok_tmp = cupy.zeros([k1-k0, nao, nocc], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] @@ -381,20 +395,23 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None, with_k=True): int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] - - tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1]) - rhoj_tmp += tmp - rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1]) - - if cpi != cpj and intopt.aosym: + if with_j: + tmp = contract('Lji,ij->L', int3c_blk, dm0_tag[i0:i1,j0:j1]) rhoj_tmp += tmp - rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1]) - wj[k0:k1] = rhoj_tmp + if cpi != cpj: + rhoj_tmp += tmp + if with_k: + 
rhok_tmp[:,j0:j1] += contract('Lji,io->Ljo', int3c_blk, orbo[i0:i1]) + if cpi != cpj: + rhok_tmp[:,i0:i1] += contract('Lji,jo->Lio', int3c_blk, orbo[j0:j1]) + if with_j: + wj[k0:k1] = rhoj_tmp if with_k: if isinstance(wk, cupy.ndarray): wk[k0:k1] = rhok_tmp else: - rhok_tmp.get(out=wk[k0:k1]) + #rhok_tmp.get(out=wk[k0:k1]) + copy_array(rhok_tmp, wk[k0:k1]) return wj, wk def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None, stream=None): @@ -484,16 +501,6 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= ao_loc = intopt.ao_loc aux_ao_loc = intopt.aux_ao_loc comp = 3**order - - lmax = intopt._sorted_mol._bas[:gto.ANG_OF].max() - aux_lmax = intopt._sorted_auxmol._bas[:gto.ANG_OF].max() - nroots = (lmax + aux_lmax + order)//2 + 1 - if nroots > NROOT_ON_GPU: - from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt._tot_mol - intor = pmol._add_suffix('int3c2e_' + ip_type) - opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nbins = 1 # If task_list is not given, generate all the tasks @@ -505,7 +512,7 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= for aux_id, cp_ij_id in task_list: cp_kl_id = aux_id + len(intopt.log_qs) lk = intopt.aux_angular[aux_id] - + cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] @@ -546,6 +553,11 @@ def loop_int3c2e_general(intopt, task_list=None, ip_type='', omega=None, stream= if err != 0: raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + # TODO: sph2cart in CPU? 
ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] @@ -670,26 +682,26 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None): get rhoj pass1 for int3c2e ''' if stream is None: stream = cupy.cuda.get_current_stream() - + n_dm = 1 naux = intopt._sorted_auxmol.nao - + coeff = intopt.coeff if dm0.ndim == 3: dm0 = dm0[0] + dm0[1] dm_cart = coeff @ dm0 @ coeff.T - + num_cp_ij = [len(log_qs) for log_qs in intopt.log_qs] num_cp_kl = [len(log_qs) for log_qs in intopt.aux_log_qs] bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) norb = dm_cart.shape[0] - + rhoj = cupy.zeros([naux]) err = libgvhf.GINTbuild_j_int3c2e_pass1( @@ -706,7 +718,7 @@ def get_j_int3c2e_pass1(intopt, dm0, sort_j=True, stream=None): ctypes.c_int(ncp_kl)) if err != 0: raise RuntimeError('CUDA error in get_j_pass1') - + if sort_j: aux_coeff = intopt.aux_coeff rhoj = cupy.dot(rhoj, aux_coeff) @@ -731,7 +743,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) - + rhoj = intopt.sort_orbitals(rhoj, aux_axis=[0]) if not intopt.auxmol.cart: rhoj = intopt.aux_cart2sph @ rhoj @@ -751,7 +763,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): if err != 0: raise RuntimeError('CUDA error in get_j_pass2') - + if not intopt.mol.cart: cart2sph = intopt.cart2sph vj = cart2sph.T @ vj @ cart2sph @@ -759,6 +771,48 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None): vj = vj + vj.T return vj +def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None): + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() + mocc = cupy.asarray(mocc) + dm0 = cupy.asarray(dm0) + naux = intopt.auxmol.nao + nocc = mocc.shape[1] + 
rhoj = cupy.zeros([naux]) + rhok = cupy.zeros([naux,nocc,nocc]) + for cp_kl_id in task_k_list: + k0 = intopt.aux_ao_loc[cp_kl_id] + k1 = intopt.aux_ao_loc[cp_kl_id+1] + rhoj_tmp = cupy.zeros([k1-k0], order='C') + rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C') + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) + if not intopt.mol.cart: + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) + int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if cpi == cpj and intopt.aosym: + int3c_blk *= 0.5 + + rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0[i0:i1,j0:j1]) + ints_o = contract('pji,jo->poi', int3c_blk, mocc[j0:j1]) + rhok_tmp += contract('poi,ir->por', ints_o, mocc[i0:i1]) + int3c_blk = ints_o = None + if intopt.aosym: + rhoj[k0:k1] = 2.0 * rhoj_tmp + rhok[k0:k1] = transpose_sum(rhok_tmp) + else: + rhoj[k0:k1] = rhoj_tmp + rhok[k0:k1] = rhok_tmp + t0 = log.timer_debug1(f'int3c2e_vjk on Device {device_id}', *t0) + return rhoj, rhok + def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): ''' get rhoj and rhok for int3c2e @@ -766,109 +820,132 @@ def get_int3c2e_jk(mol, auxmol, dm0_tag, with_k=True, omega=None): intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(1e-14, diag_block_with_triu=True, aosym=True, group_size=BLKSIZE, group_size_aux=BLKSIZE) - if omega is None: omega = 0.0 - naux = auxmol.nao orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - nocc = orbo.shape[1] - rhoj = cupy.empty([naux]) - rhok = cupy.empty([naux,nocc,nocc]) + futures = [] + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) - for cp_kl_id, _ in enumerate(intopt.aux_log_qs): - k0 = intopt.aux_ao_loc[cp_kl_id] - 
k1 = intopt.aux_ao_loc[cp_kl_id+1] - rhoj_tmp = cupy.zeros([k1-k0], order='C') - rhok_tmp = cupy.zeros([k1-k0, nocc, nocc], order='C') - for cp_ij_id, _ in enumerate(intopt.log_qs): - cpi = intopt.cp_idx[cp_ij_id] - cpj = intopt.cp_jdx[cp_ij_id] - li = intopt.angular[cpi] - lj = intopt.angular[cpj] - int3c_blk = get_int3c2e_slice(intopt, cp_ij_id, cp_kl_id, omega=omega) - if not intopt.mol.cart: - int3c_blk = cart2sph(int3c_blk, axis=1, ang=lj) - int3c_blk = cart2sph(int3c_blk, axis=2, ang=li) - i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] - j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] - if cpi == cpj and intopt.aosym: - int3c_blk *= 0.5 + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _int3c2e_jk_task, intopt, task_list[device_id], + dm0_tag, orbo, device_id=device_id, omega=omega) + futures.append(future) - rhoj_tmp += contract('pji,ij->p', int3c_blk, dm0_tag[i0:i1,j0:j1]) - ints_o = contract('pji,jo->poi', int3c_blk, orbo[j0:j1]) - rhok_tmp += contract('poi,ir->por', ints_o, orbo[i0:i1]) + rhoj_total = [] + rhok_total = [] + for future in futures: + rhoj, rhok = future.result() + rhoj_total.append(rhoj) + rhok_total.append(rhok) - if intopt.aosym: - rhoj[k0:k1] = 2.0 * rhoj_tmp - rhok[k0:k1] = rhok_tmp + rhok_tmp.transpose([0,2,1]) - else: - rhoj[k0:k1] = rhoj_tmp - rhok[k0:k1] = rhok_tmp + rhoj = rhok = None + rhoj = reduce_to_device(rhoj_total, inplace=True) + if with_k: + rhok = reduce_to_device(rhok_total, inplace=True) return rhoj, rhok -def _int3c2e_ip1_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None): +def _split_tasks(loads, ngroups): + ''' Split a list of numbers into sublists with sums as close as possible + ''' + if ngroups == 1: + return [range(len(loads))] + groups = [[] for _ in range(ngroups)] + sums = [0] * ngroups + + sorted_indices = np.argsort(loads)[::-1] + for 
idx in sorted_indices: + min_index = sums.index(min(sums)) + groups[min_index].append(idx) + sums[min_index] += loads[idx] + return groups + +def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=0, + with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao aoslices = intopt.mol.aoslice_by_atom() + vj1_buf = vk1_buf = vj1 = vk1 = None + with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() ao2atom = get_ao2atom(intopt, aoslices) - rhoj = cupy.asarray(rhoj) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) nocc = orbo.shape[1] - vj1_buf = cupy.zeros([3,nao,nao]) - vk1_buf = cupy.zeros([3,nao,nao]) - vj1 = cupy.zeros([natom,3,nao,nocc]) - vk1 = cupy.zeros([natom,3,nao,nocc]) + if with_j: + rhoj = cupy.asarray(rhoj) + vj1_buf = cupy.zeros([3,nao,nao]) + vj1 = cupy.zeros([natom,3,nao,nocc]) + if with_k: + vk1_buf = cupy.zeros([3,nao,nao]) + vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) - for cp_k in task_list: + for cp_k in task_k_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] - rhok_tmp = cupy.asarray(rhok[k0:k1]) + #rhok_tmp = cupy.asarray(rhok[k0:k1]) + rhok_tmp = copy_array(rhok[k0:k1]) if with_k: rhok0 = contract('pio,ir->pro', rhok_tmp, orbo) rhok0 = contract('pro,Jo->prJ', rhok0, orbo) - rhoj0 = cupy.zeros([3,k1-k0,nao]) - int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc]) + int3c_ip1_occ = cupy.zeros([3,k1-k0,nao,nocc]) + if with_j: + rhoj0 = cupy.zeros([3,k1-k0,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, ip_type='ip1', omega=omega): - vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) - rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - + if with_j: 
+ vj1_buf[:,i0:i1,j0:j1] += contract('xpji,p->xij', int3c_blk, rhoj[k0:k1]) + rhoj0[:,:,i0:i1] += contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) if with_k: + int3c_ip1_occ[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + vk1_ao = contract('xpji,poi->xijo', int3c_blk, rhok0[:,:,i0:i1]) vk1[:,:,j0:j1] += contract('xijo,ia->axjo', vk1_ao, ao2atom[i0:i1]) - - int3c_occ = contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - rhok0_slice = contract('pJr,ir->pJi', rhok_tmp, orbo[i0:i1]) - - vk1_ao = contract('xpio,pJi->xiJo', int3c_occ, rhok0_slice) - vk1 += contract('xiJo,ia->axJo', vk1_ao, ao2atom[i0:i1]) - vk1_ao = int3c_occ = None - rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) - vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) - rhoj0_atom = None - vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) + vk1_ao = int3c_blk = None + if with_j: + rhoj0_atom = contract('xpi,ia->xpa', rhoj0, 2.0*ao2atom) + vj1 += contract('pJo,xpa->axJo', rhok_tmp, rhoj0_atom) + rhoj0_atom = rhoj0 = None + if with_k: + rhok0 = None + vk1_buf += contract('xpio,plo->xil', int3c_ip1_occ, rhok_tmp) + mem_avail = get_avail_mem() + blksize = min(int(mem_avail * 0.2 / ((k1-k0) * nao) * 8), + int(mem_avail * 0.2 / (nocc * nao * 3 * 8))) + for p0, p1, in lib.prange(0, nao, blksize): + rhok0_slice = contract('pJr,ir->pJi', rhok_tmp[:,p0:p1], orbo) + vk1_ao = contract('xpio,pJi->xiJo', int3c_ip1_occ, rhok0_slice) + vk1[:,:,p0:p1] += contract('xiJo,ia->axJo', vk1_ao, ao2atom) + rhok0_slice = vk1_ao = None + rhok_tmp = int3c_ip1_occ = None + t0 = log.timer_debug1(f'int3c2e_ip1_vjk on Device {device_id}', *t0) # TODO: absorbe vj1_buf and vk1_buf into vj1 and vk1 return vj1_buf, vk1_buf, vj1, vk1 -def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omega=None): +def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_j=True, + with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k 
= len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip1_vjk_task, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) + _int3c2e_ip1_vjk_task, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k, + device_id=device_id, omega=omega) futures.append(future) - + vj1_buf_total = [] vk1_buf_total = [] vj1_total = [] @@ -879,48 +956,61 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vk1_buf_total.append(vk1_buf) vj1_total.append(vj1) vk1_total.append(vk1) - + vj1 = vk1 = vj1_buf = vk1_buf = None - vj1 = reduce_to_device(vj1_total, inplace=True) - vj1_buf = reduce_to_device(vj1_buf_total, inplace=True) + if with_j: + vj1 = reduce_to_device(vj1_total, inplace=True) + vj1_buf = reduce_to_device(vj1_buf_total, inplace=True) if with_k: vk1 = reduce_to_device(vk1_total, inplace=True) vk1_buf = reduce_to_device(vk1_buf_total, inplace=True) return vj1_buf, vk1_buf, vj1, vk1 -def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, with_k=True, omega=None): +def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, + device_id=0, with_j=True, with_k=True, omega=None): natom = intopt.mol.natm nao = intopt.mol.nao auxslices = intopt.auxmol.aoslice_by_atom() + vj1 = vk1 = None with cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() aux2atom = get_aux2atom(intopt, auxslices) - rhoj = cupy.asarray(rhoj) dm0 = 
cupy.asarray(dm0) orbo = cupy.asarray(orbo) nocc = orbo.shape[1] - vj1 = cupy.zeros([natom,3,nao,nocc]) - vk1 = cupy.zeros([natom,3,nao,nocc]) + if with_j: + rhoj = cupy.asarray(rhoj) + vj1 = cupy.zeros([natom,3,nao,nocc]) + if with_k: + vk1 = cupy.zeros([natom,3,nao,nocc]) aux_ao_loc = intopt.aux_ao_loc ncp_ij = len(intopt.log_qs) - for cp_k in task_list: + for cp_k in task_k_list: task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] - wj2 = cupy.zeros([3,k1-k0]) + if with_j: + wj2 = cupy.zeros([3,k1-k0]) + wk2_P__ = cupy.zeros([3,k1-k0,nao,nocc]) for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, ip_type='ip2', omega=omega): # contraction - wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1]) - wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) - rhok_tmp = cupy.asarray(rhok[k0:k1]) - vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) - vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) + if with_j: + wj2 += contract('xpji,ji->xp', int3c_blk, dm0[j0:j1,i0:i1]) - vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) + wk2_P__[:,:,i0:i1] += contract('xpji,jo->xpio', int3c_blk, orbo[j0:j1]) + int3c_blk = None + #rhok_tmp = cupy.asarray(rhok[k0:k1]) + rhok_tmp = copy_array(rhok[k0:k1]) + if with_j: + vj1_tmp = -contract('pio,xp->xpio', rhok_tmp, wj2) + vj1_tmp -= contract('xpio,p->xpio', wk2_P__, rhoj[k0:k1]) + + vj1 += contract('xpio,pa->axio', vj1_tmp, aux2atom[k0:k1]) + vj1_tmp = wj2 = None if with_k: - #rhok0_slice = contract('pio,jo->pij', rhok_tmp, orbo) - #vk1_tmp = -contract('xpjo,pij->xpio', wk2_P__, rhok0_slice) rhok0_slice = contract('xpjo,jr->xpro', wk2_P__, orbo) vk1_tmp = -contract('xpro,pir->xpio', rhok0_slice, rhok_tmp) @@ -928,54 +1018,59 @@ def _int3c2e_ip2_vjk_task(intopt, task_list, rhoj, rhok, dm0, orbo, device_id=0, vk1_tmp -= contract('xpio,pro->xpir', wk2_P__, rhok0_oo) vk1 += contract('xpir,pa->axir', vk1_tmp, 
aux2atom[k0:k1]) - wj2 = wk2_P__ = rhok0_slice = rhok0_oo = None - rhok_tmp = vk1_tmp = None + vk1_tmp = rhok0_oo = rhok0_slice = None + rhok_tmp = wk2_P__ = None + t0 = log.timer_debug1(f'int3c2e_ip2_vjk on Device {device_id}', *t0) return vj1, vk1 -def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, omega=None): +def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, + with_j=True, with_k=True, omega=None): ''' vj and vk responses (due to int3c2e_ip2) to changes in atomic positions ''' orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip2_vjk_task, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) + _int3c2e_ip2_vjk_task, intopt, task_list[device_id], + rhoj, rhok, dm0_tag, orbo, with_j=with_j, + with_k=with_k, device_id=device_id, omega=omega) futures.append(future) - + vj_total = [] vk_total = [] for future in futures: vj, vk = future.result() vj_total.append(vj) vk_total.append(vk) - + vj = vk = None - vj = reduce_to_device(vj_total, inplace=True) + if with_j: + vj = reduce_to_device(vj_total, inplace=True) if with_k: vk = reduce_to_device(vk_total, inplace=True) return vj, vk -def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None): +def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_k=True, omega=None): nao = intopt.mol.nao naux = intopt.auxmol.nao aux_ao_loc = intopt.aux_ao_loc with 
cupy.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() ncp_ij = len(intopt.log_qs) nocc = orbo.shape[1] wj = cupy.zeros([naux,nao,3]) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) - for cp_k in task_list: + for cp_k in task_k_list: k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] if with_k: wk_tmp = cupy.zeros([k1-k0,nao,nocc,3]) @@ -985,8 +1080,12 @@ def _int3c2e_ip1_wjk_task(intopt, task_list, dm0, orbo, wk, device_id=0, with_k= wj[k0:k1,i0:i1] += contract('xpji,ij->pix', int3c_blk, dm0[i0:i1,j0:j1]) if with_k: wk_tmp[:,i0:i1] += contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) + int3c_blk = None if with_k: - wk_tmp.get(out=wk[k0:k1]) + #wk_tmp.get(out=wk[k0:k1]) + copy_array(wk_tmp, wk[k0:k1]) + wk_tmp = None + t0 = log.timer_debug1(f'int3c2e_ip1_wjk on Device {device_id}', *t0) return wj def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): @@ -994,12 +1093,11 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): ''' orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - tasks = np.array(list(range(ncp_k))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, _num_devices) + nao = intopt.mol.nao naux = intopt.auxmol.nao nocc = orbo.shape[1] @@ -1012,7 +1110,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None): with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): future = executor.submit( - _int3c2e_ip1_wjk_task, intopt, task_list[device_id], + _int3c2e_ip1_wjk_task, intopt, task_list[device_id], dm0_tag, orbo, wk, with_k=with_k, device_id=device_id, omega=omega) futures.append(future) wj_total = [] @@ -1023,7 +1121,12 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, 
with_k=True, omega=None): return wj, wk def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0): + aux_ao_loc = intopt.aux_ao_loc with cupy.cuda.Device(device_id), _streams[device_id]: + cupy.get_default_memory_pool().free_all_blocks() + log = logger.new_logger(intopt.mol, intopt.mol.verbose) + t0 = log.init_timer() + ncp_ij = len(intopt.log_qs) dm0 = cupy.asarray(dm0) orbo = cupy.asarray(orbo) naux = intopt.auxmol.nao @@ -1032,24 +1135,29 @@ def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, devi wk = None if with_k: wk = cupy.zeros([naux,nocc,nocc,3]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ip2', omega=omega): - wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1]) - tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) - if with_k: - wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) + for cp_k in task_list: + k0, k1 = aux_ao_loc[cp_k], aux_ao_loc[cp_k+1] + task_list = [(cp_k, cp_ij) for cp_ij in range(ncp_ij)] + + for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, + ip_type='ip2', omega=omega): + wj[k0:k1] += contract('xpji,ji->px', int3c_blk, dm0[j0:j1,i0:i1]) + if with_k: + tmp = contract('xpji,jo->piox', int3c_blk, orbo[j0:j1]) + wk[k0:k1] += contract('piox,ir->prox', tmp, orbo[i0:i1]) + tmp = None + int3c_blk = None + t0 = log.timer_debug1(f'int3c2e_ip2_wjk on Device {device_id}', *t0) return wj, wk def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') futures = [] - ncp_k = len(intopt.aux_log_qs) - ncp_ij = len(intopt.log_qs) - tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - + + aux_ao_loc = np.array(intopt.aux_ao_loc) + loads = aux_ao_loc[1:] - aux_ao_loc[:-1] + task_list = _split_tasks(loads, 
_num_devices) + cupy.cuda.get_current_stream().synchronize() with ThreadPoolExecutor(max_workers=_num_devices) as executor: for device_id in range(_num_devices): @@ -1057,205 +1165,20 @@ def get_int3c2e_ip2_wjk(intopt, dm0_tag, with_k=True, omega=None): _int3c2e_ip2_wjk, intopt, task_list[device_id], dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) futures.append(future) - + wj_total = [] wk_total = [] for future in futures: wj, wk = future.result() wj_total.append(wj) wk_total.append(wk) - + wj = wk = None wj = reduce_to_device(wj_total, inplace=True) if with_k: wk = reduce_to_device(wk_total, inplace=True) return wj, wk -def _int3c2e_ipip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ipip1', omega=omega): - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1] += contract('xpi,p->ix', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - hk[i0:i1] += contract('xpji,pij->ix', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,3,3]) - if with_k: - hk = hk.reshape([nao,3,3]) - return hj, hk - -def _int3c2e_ipvip1_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,nao,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,nao,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, 
task_list=task_list, - ip_type='ipvip1', omega=omega): - tmp = contract('xpji,ij->xpij', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1,j0:j1] += contract('xpij,p->ijx', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pji', rhok_tmp, orbo[j0:j1]) - hk[i0:i1,j0:j1] += contract('xpji,pji->ijx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,nao,3,3]) - if with_k: - hk = hk.reshape([nao,nao,3,3]) - return hj, hk - -def _int3c2e_ip1ip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - naux = rhok.shape[0] - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - nao = dm0.shape[0] - hj = cupy.zeros([nao,naux,9]) - hk = None - if with_k: - hk = cupy.zeros([nao,naux,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ip1ip2', omega=omega): - tmp = contract('xpji,ij->xpi', int3c_blk, dm0[i0:i1,j0:j1]) - hj[i0:i1,k0:k1] += contract('xpi,p->ipx', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,ir->pio', rhok[k0:k1], orbo[i0:i1]) - rhok_tmp = contract('pio,jo->pij', rhok_tmp, orbo[j0:j1]) - hk[i0:i1,k0:k1] += contract('xpji,pij->ipx', int3c_blk, rhok_tmp) - hj = hj.reshape([nao,naux,3,3]) - if with_k: - hk = hk.reshape([nao,naux,3,3]) - return hj, hk - -def _int3c2e_ipip2_hjk(intopt, task_list, rhoj, rhok, dm0, orbo, - device_id=0, with_k=True, omega=None): - with cupy.cuda.Device(device_id), _streams[device_id]: - naux = rhok.shape[0] - rhoj = cupy.asarray(rhoj) - rhok = cupy.asarray(rhok) - orbo = cupy.asarray(orbo) - dm0 = cupy.asarray(dm0) - hj = cupy.zeros([naux,9]) - hk = None - if with_k: - hk = cupy.zeros([naux,9]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, task_list=task_list, - ip_type='ipip2', omega=omega): - tmp = contract('xpji,ij->xp', int3c_blk, dm0[i0:i1,j0:j1]) - 
hj[k0:k1] += contract('xp,p->px', tmp, rhoj[k0:k1]) - if with_k: - rhok_tmp = contract('por,jr->pjo', rhok[k0:k1], orbo[j0:j1]) - rhok_tmp = contract('pjo,io->pji', rhok_tmp, orbo[i0:i1]) - hk[k0:k1] += contract('xpji,pji->px', int3c_blk, rhok_tmp) - hj = hj.reshape([naux,3,3]) - if with_k: - hk = hk.reshape([naux,3,3]) - return hj, hk - -def get_int3c2e_hjk(intopt, task_type, rhoj, rhok, dm0_tag, with_k=True, omega=None): - if task_type == 'ipip1': task_fn = _int3c2e_ipip1_hjk - if task_type == 'ipip2': task_fn = _int3c2e_ipip2_hjk - if task_type == 'ip1ip2': task_fn = _int3c2e_ip1ip2_hjk - if task_type == 'ipvip1': task_fn = _int3c2e_ipvip1_hjk - - orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') - futures = [] - ncp_k = len(intopt.aux_log_qs) - ncp_ij = len(intopt.log_qs) - tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij)))) - task_list = [] - for device_id in range(_num_devices): - task_list.append(tasks[device_id::_num_devices]) - - cupy.cuda.get_current_stream().synchronize() - with ThreadPoolExecutor(max_workers=_num_devices) as executor: - for device_id in range(_num_devices): - future = executor.submit( - task_fn, intopt, task_list[device_id], - rhoj, rhok, dm0_tag, orbo, with_k=with_k, device_id=device_id, omega=omega) - futures.append(future) - - hj_total = [] - hk_total = [] - for future in futures: - hj, hk = future.result() - hj_total.append(hj) - hk_total.append(hk) - - hj = hk = None - hj = reduce_to_device(hj_total, inplace=True) - if with_k: - hk = reduce_to_device(hk_total, inplace=True) - return hj, hk - -def get_hess_nuc_elec(mol, dm): - ''' - calculate int1e_ipiprinv contribution - ''' - coords = mol.atom_coords() - charges = cupy.asarray(mol.atom_charges(), dtype=np.float64) - - fakemol = gto.fakemol_for_charges(coords) - fakemol.output = mol.output - fakemol.verbose = mol.verbose - fakemol.stdout = mol.stdout - intopt = VHFOpt(mol, fakemol, 'int2e') - intopt.build(1e-14, diag_block_with_triu=True, aosym=False, 
group_size=BLKSIZE, group_size_aux=BLKSIZE) - dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) - - natm = mol.natm - nao = mol.nao - hcore_diag = cupy.zeros([9,natm]) - hcore_aa = cupy.zeros([9,natm,nao]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipip1'): - haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) - hcore_aa[:,k0:k1,i0:i1] += haa - hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) - - hcore_ab = cupy.zeros([9,natm,nao]) - for i0,i1,j0,j1,k0,k1,int3c_blk in loop_int3c2e_general(intopt, ip_type='ipvip1'): - hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) - hcore_ab[:,k0:k1,i0:i1] += hab - hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) - - hcore_diag = contract('xp,p->xp', hcore_diag, charges) - hcore_aa = contract('xpj,p->xpj', hcore_aa, charges) - hcore_ab = contract('xpj,p->xpj', hcore_ab, charges) - - aoslices = mol.aoslice_by_atom() - ao2atom = get_ao2atom(intopt, aoslices) - - hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm]) - hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm]) - hcore = hcore_aa + hcore_aa.transpose([1,0,3,2]) - hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2]) - hcore_diag = hcore_diag.reshape([3,3,natm]) - idx = np.arange(natm) - for x in range(3): - for y in range(3): - hcore[x,y,idx,idx] += hcore_diag[x,y] - return hcore - def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, stream=None): ''' Generate int3c2e_ip slice along k, full dimension in ij @@ -1414,15 +1337,6 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - lmax = mol._bas[:gto.ANG_OF].max() - aux_lmax = auxmol._bas[:gto.ANG_OF].max() - nroots = (lmax + aux_lmax + order)//2 + 1 - 
if nroots > NROOT_ON_GPU: - from pyscf.gto.moleintor import getints, make_cintopt - pmol = intopt._tot_mol - intor = pmol._add_suffix('int3c2e_' + ip_type) - opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) - nao_cart = intopt._sorted_mol.nao naux_cart = intopt._sorted_auxmol.nao norb_cart = nao_cart + naux_cart + 1 @@ -1472,6 +1386,11 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di if err != 0: raise RuntimeError("int3c2e failed\n") else: + from pyscf.gto.moleintor import getints, make_cintopt + pmol = intopt._tot_mol + intor = pmol._add_suffix('int3c2e_' + ip_type) + opt = make_cintopt(pmol._atm, pmol._bas, pmol._env, intor) + # TODO: sph2cart in CPU? ishl0, ishl1 = intopt.l_ctr_offsets[cpi], intopt.l_ctr_offsets[cpi+1] jshl0, jshl1 = intopt.l_ctr_offsets[cpj], intopt.l_ctr_offsets[cpj+1] @@ -1562,7 +1481,7 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N nbins = 1 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) - + cart_ao_loc = intopt.cart_ao_loc cart_aux_loc = intopt.cart_aux_loc i0, i1 = cart_ao_loc[cpi], cart_ao_loc[cpi+1] @@ -1604,11 +1523,11 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, cart=False, aosym=None, out=N if err != 0: raise RuntimeError('GINT_fill_int2e failed') - + # move this operation to j2c? 
if lk > 1 and intopt.auxmol.cart == 0: int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out) - + stream.synchronize() return int3c_blk diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py index 8e254c67..266cef29 100644 --- a/gpu4pyscf/df/tests/test_df_hessian.py +++ b/gpu4pyscf/df/tests/test_df_hessian.py @@ -135,7 +135,7 @@ def test_hessian_rhf(self, disp=None): h = hobj.kernel() _check_rhf_hessian(mf, h, ix=0, iy=0) _check_rhf_hessian(mf, h, ix=0, iy=1) - + def test_hessian_lda(self, disp=None): print('-----testing DF LDA Hessian----') mf = _make_rks(mol_sph, 'LDA') @@ -239,7 +239,6 @@ def test_hessian_rks_D3(self): hobj = mf.Hessian() hobj.set(auxbasis_response=2) h = hobj.kernel() - print(np.linalg.norm(h)) _check_dft_hessian(mf, h, ix=0,iy=0) def test_hessian_rks_D4(self): diff --git a/gpu4pyscf/df/tests/test_df_rhf.py b/gpu4pyscf/df/tests/test_df_rhf.py index e724015a..c2f3caa9 100644 --- a/gpu4pyscf/df/tests/test_df_rhf.py +++ b/gpu4pyscf/df/tests/test_df_rhf.py @@ -13,12 +13,17 @@ # limitations under the License. 
import unittest +import pickle import numpy as np import pyscf from pyscf import scf as cpu_scf from pyscf.df import df_jk as cpu_df_jk from gpu4pyscf.df import df_jk as gpu_df_jk from gpu4pyscf import scf as gpu_scf +try: + import cloudpickle +except ImportError: + cloudpickle = None atom = ''' O 0.0000000000 -0.0000000000 0.1174000000 @@ -48,12 +53,17 @@ class KnownValues(unittest.TestCase): ''' def test_rhf(self): print('------- RHF -----------------') - mf = gpu_scf.RHF(mol_sph).density_fit(auxbasis='def2-tzvpp-jkfit') + mf = mol_sph.RHF().density_fit(auxbasis='def2-tzvpp-jkfit').to_gpu() e_tot = mf.kernel() e_qchem = -76.0624582299 print(f'diff from qchem {e_tot - e_qchem}') assert np.abs(e_tot - e_qchem) < 1e-5 + # test serialization + if cloudpickle is not None: + mf1 = pickle.loads(cloudpickle.dumps(mf)) + assert mf1.e_tot == e_tot + def test_cart(self): print('------- RHF Cart -----------------') mf = gpu_scf.RHF(mol_cart).density_fit(auxbasis='def2-tzvpp-jkfit') diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 70186a5a..17498c7d 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -32,11 +32,11 @@ LMAX_ON_GPU = 6 BAS_ALIGNED = 1 -GRID_BLKSIZE = 32 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 64*64) ALIGNED = getattr(__config__, 'grid_aligned', 16*16) AO_ALIGNMENT = getattr(__config__, 'ao_aligned', 16) AO_THRESHOLD = 1e-10 +GB = 1024*1024*1024 # Should we release the cupy cache? 
FREE_CUPY_CACHE = False @@ -273,26 +273,23 @@ def eval_rho4(mol, ao, mo0, mo1, non0tab=None, xctype='LDA', hermi=0, na = mo1.shape[0] if xctype == 'LDA' or xctype == 'HF': c0 = mo0.T.dot(ao) - t1 = log.timer_debug2('eval occ_coeff', *t0) - c_0 = contract('aio,ig->aog', mo1, ao) rho = cupy.empty([na,ngrids]) for i in range(na): - rho[i] = _contract_rho(c0, c_0[i]) + c_0 = contract('io,ig->og', mo1[i], ao) + rho[i] = _contract_rho(c0, c_0) elif xctype in ('GGA', 'NLC'): c0 = contract('nig,io->nog', ao, mo0) - t1 = log.timer_debug2('eval occ_coeff', *t0) - c_0 = contract('nig,aio->anog', ao, mo1) - t1 = log.timer_debug2('ao * cpos', *t1) rho = cupy.empty([na, 4, ngrids]) for i in range(na): - _contract_rho_gga(c0, c_0[i], rho=rho[i]) + c_0 = contract('nig,io->nog', ao, mo1[i]) + _contract_rho_gga(c0, c_0, rho=rho[i]) else: # meta-GGA assert not with_lapl rho = cupy.empty((na,5,ngrids)) c0 = contract('nig,io->nog', ao, mo0) - c_0 = contract('nig,aio->anog', ao, mo1) for i in range(na): - _contract_rho_mgga(c0, c_0[i], rho=rho[i]) + c_0 = contract('nig,io->nog', ao, mo1[i]) + _contract_rho_mgga(c0, c_0, rho=rho[i]) if hermi: # corresponding to the density of ao * mo1[i].dot(mo0.T) * ao rho *= 2. 
@@ -417,9 +414,11 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices - grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE + grid_start = min(device_id * ngrids_per_device, ngrids_glob) + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") weights = cupy.empty([ngrids_local]) if xctype == 'LDA': @@ -428,7 +427,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, rho_tot = cupy.empty([nset,4,ngrids_local]) else: rho_tot = cupy.empty([nset,5,ngrids_local]) - + p0 = p1 = 0 for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, @@ -436,8 +435,10 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, p1 = p0 + weight.size weights[p0:p1] = weight for i in range(nset): + # If AO is sparse enough, use density matrix to calculate rho if mo_coeff is None: - rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms[i][idx[:,None],idx], + dms_mask = dms[i][idx[:,None],idx] + rho_tot[i,:,p0:p1] = eval_rho(_sorted_mol, ao_mask, dms_mask, xctype=xctype, hermi=hermi, with_lapl=with_lapl) else: assert hermi == 1 @@ -446,7 +447,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, None, xctype, with_lapl) p0 = p1 t0 = log.timer_debug1(f'eval rho on Device {device_id}', *t0) - + # libxc calls are still running on default stream nelec = cupy.zeros(nset) excsum = cupy.zeros(nset) @@ -817,8 +818,11 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices - grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * 
ngrids_per_device + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE + grid_start = min(device_id * ngrids_per_device, ngrids_glob) + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory=None, @@ -1019,13 +1023,16 @@ def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, ngrids_glob = grids.coords.shape[0] ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices - grid_start = device_id * ngrids_per_device - grid_end = (device_id + 1) * ngrids_per_device + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE + grid_start = min(device_id * ngrids_per_device, ngrids_glob) + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") p0 = p1 = grid_start t1 = t0 = log.init_timer() for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, - max_memory=None, + max_memory=None, blksize=None, grid_range=(grid_start, grid_end)): p0, p1 = p1, p1+len(weights) # precompute molecular orbitals @@ -1133,6 +1140,105 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc, max_memory=max_memory, verbose=verbose) +def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff, + verbose=None, hermi=1, device_id=0): + with cupy.cuda.Device(device_id), _streams[device_id]: + if dms is not None: + dma, dmb = dms + dma = cupy.asarray(dma) + dmb = cupy.asarray(dmb) + if mo1 is not None: + mo1a, mo1b = mo1 + mo1a = cupy.asarray(mo1a) + mo1b = cupy.asarray(mo1b) + if occ_coeff is not None: + occ_coeff_a, occ_coeff_b = occ_coeff + occ_coeff_a = cupy.asarray(occ_coeff_a) + 
occ_coeff_b = cupy.asarray(occ_coeff_b) + + if fxc is not None: fxc = cupy.asarray(fxc) + assert isinstance(verbose, int) + log = logger.new_logger(mol, verbose) + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + + _sorted_mol = opt.mol + nao = mol.nao + nset = len(dma) + vmata = cupy.zeros((nset, nao, nao)) + vmatb = cupy.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 + + ngrids_glob = grids.coords.shape[0] + ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + ngrids_per_device = (ngrids_per_device + MIN_BLK_SIZE - 1) // MIN_BLK_SIZE * MIN_BLK_SIZE + grid_start = min(device_id * ngrids_per_device, ngrids_glob) + grid_end = min((device_id + 1) * ngrids_per_device, ngrids_glob) + ngrids_local = grid_end - grid_start + log.debug(f"{ngrids_local} on Device {device_id}") + + p0 = p1 = grid_start + t1 = t0 = log.init_timer() + for ao, mask, weights, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=None, + grid_range=(grid_start, grid_end)): + + t0 = log.init_timer() + p0, p1 = p1, p1+len(weights) + # precompute fxc_w + fxc_w = fxc[:,:,:,:,p0:p1] * weights + + # precompute molecular orbitals + if occ_coeff is not None: + occ_coeff_a_mask = occ_coeff_a[mask] + occ_coeff_b_mask = occ_coeff_b[mask] + rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], + xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0) + rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], + xctype=xctype, hermi=hermi).reshape(nset,-1,p1-p0) + else: # slow version + rho1a = [] + rho1b = [] + for i in range(nset): + rho_tmp = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1a.append(rho_tmp.reshape(-1,p1-p0)) + rho_tmp = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], + xctype=xctype, hermi=hermi) + rho1b.append(rho_tmp.reshape(-1,p1-p0)) + t0 = log.timer_debug1('rho', *t0) + + for i in range(nset): + wv_a = contract('xg,xyg->yg', rho1a[i], 
fxc_w[0,:,0]) + wv_a+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,0]) + wv_b = contract('xg,xyg->yg', rho1a[i], fxc_w[0,:,1]) + wv_b+= contract('xg,xyg->yg', rho1b[i], fxc_w[1,:,1]) + if xctype == 'LDA': + va = ao.dot(_scale_ao(ao, wv_a[0]).T) + vb = ao.dot(_scale_ao(ao, wv_b[0]).T) + elif xctype == 'GGA': + wv_a[0] *= .5 # for transpose_sum at the end + wv_b[0] *= .5 + va = ao[0].dot(_scale_ao(ao, wv_a).T) + vb = ao[0].dot(_scale_ao(ao, wv_b).T) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + else: + wv_a[[0,4]] *= .5 # for transpose_sum at the end + wv_b[[0,4]] *= .5 + va = ao[0].dot(_scale_ao(ao[:4], wv_a[:4]).T) + vb = ao[0].dot(_scale_ao(ao[:4], wv_b[:4]).T) + va += _tau_dot(ao, ao, wv_a[4]) + vb += _tau_dot(ao, ao, wv_b[4]) + add_sparse(vmata[i], va, mask) + add_sparse(vmatb[i], vb, mask) + t1 = log.timer_debug2('integration', *t1) + t0 = log.timer_debug1('vxc', *t0) + return vmata, vmatb def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): @@ -1144,13 +1250,13 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if opt is None or mol not in [opt.mol, opt._sorted_mol]: ni.build(mol, grids.coords) opt = ni.gdftopt - mol = None - _sorted_mol = opt._sorted_mol + nao, nao0 = opt.coeff.shape dma, dmb = dms dm_shape = dma.shape # AO basis -> gdftopt AO basis with_mocc = hasattr(dms, 'mo1') + mo1 = occ_coeff = None if with_mocc: mo1a, mo1b = dms.mo1 occ_coeffa, occ_coeffb = dms.occ_coeff @@ -1158,70 +1264,32 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= mo1b = opt.sort_orbitals(mo1b, axis=[1]) occ_coeff_a = opt.sort_orbitals(occ_coeffa, axis=[0]) occ_coeff_b = opt.sort_orbitals(occ_coeffb, axis=[0]) - + occ_coeff = (occ_coeff_a, occ_coeff_b) + mo1 = (mo1a, mo1b) dma = cupy.asarray(dma).reshape(-1,nao0,nao0) dmb = cupy.asarray(dmb).reshape(-1,nao0,nao0) dma = opt.sort_orbitals(dma, axis=[1,2]) 
dmb = opt.sort_orbitals(dmb, axis=[1,2]) - nset = len(dma) - vmata = cupy.zeros((nset, nao, nao)) - vmatb = cupy.zeros((nset, nao, nao)) - - if xctype == 'LDA': - ao_deriv = 0 - nvar = 1 - elif xctype == 'GGA': - ao_deriv = 1 - nvar = 4 - else: - ao_deriv = 1 - nvar = 5 - p0 = p1 = 0 - for ao, mask, weights, coords in ni.block_loop( - _sorted_mol, grids, nao, ao_deriv, max_memory=max_memory): - t0 = log.init_timer() - p0, p1 = p1, p1+len(weights) - # precompute fxc_w - fxc_w = fxc[:,:,:,:,p0:p1] * weights - - # precompute molecular orbitals - if with_mocc: - occ_coeff_a_mask = occ_coeff_a[mask] - occ_coeff_b_mask = occ_coeff_b[mask] - rho1a = eval_rho4(_sorted_mol, ao, occ_coeff_a_mask, mo1a[:,mask], - xctype=xctype, hermi=hermi) - rho1b = eval_rho4(_sorted_mol, ao, occ_coeff_b_mask, mo1b[:,mask], - xctype=xctype, hermi=hermi) - rho1 = cupy.stack([rho1a, rho1b]).reshape(2, nset, nvar, p1-p0) - else: # slow version - rho1 = cupy.empty((2, nset, nvar, p1-p0)) - for i in range(nset): - rho1[0,i] = eval_rho(_sorted_mol, ao, dma[i,mask[:,None],mask], - xctype=xctype, hermi=hermi) - rho1[1,i] = eval_rho(_sorted_mol, ao, dmb[i,mask[:,None],mask], - xctype=xctype, hermi=hermi) - t0 = log.timer_debug1('rho', *t0) + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _nr_uks_fxc_task, + ni, mol, grids, xc_code, fxc, (dma, dmb), mo1, occ_coeff, + verbose=log.verbose, hermi=hermi, device_id=device_id) + futures.append(future) + vmata_dist = [] + vmatb_dist = [] + for future in futures: + vmata, vmatb = future.result() + vmata_dist.append(vmata) + vmatb_dist.append(vmatb) + + vmata = reduce_to_device(vmata_dist, inplace=True) + vmatb = reduce_to_device(vmatb_dist, inplace=True) - for i in range(nset): - wv = contract('axg,axbyg->byg', rho1[:,i], fxc_w) - if xctype == 'LDA': - va = ao.dot(_scale_ao(ao, wv[0,0]).T) - vb = 
ao.dot(_scale_ao(ao, wv[1,0]).T) - elif xctype == 'GGA': - wv[:,0] *= .5 # for transpose_sum at the end - va = ao[0].dot(_scale_ao(ao, wv[0]).T) - vb = ao[0].dot(_scale_ao(ao, wv[1]).T) - elif xctype == 'NLC': - raise NotImplementedError('NLC') - else: - wv[:,[0,4]] *= .5 # for transpose_sum at the end - va = ao[0].dot(_scale_ao(ao[:4], wv[0,:4]).T) - vb = ao[0].dot(_scale_ao(ao[:4], wv[1,:4]).T) - va += _tau_dot(ao, ao, wv[0,4]) - vb += _tau_dot(ao, ao, wv[1,4]) - add_sparse(vmata[i], va, mask) - add_sparse(vmatb[i], vb, mask) vmata = opt.unsort_orbitals(vmata, axis=[1,2]) vmatb = opt.unsort_orbitals(vmatb, axis=[1,2]) if xctype != 'LDA': @@ -1578,7 +1646,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, comp = (deriv+1)*(deriv+2)*(deriv+3)//6 if blksize is None: - #cupy.get_default_memory_pool().free_all_blocks() + # By default, a memory space of [comp,nao,blksize] is reserved mem_avail = get_avail_mem() blksize = int((mem_avail*.2/8/((comp+1)*nao + extra))/ ALIGNED) * ALIGNED blksize = min(blksize, MIN_BLK_SIZE) @@ -1737,6 +1805,9 @@ class NumInt(lib.StreamObject, LibXCMixin): screen_index = None xcfuns = None # can be multiple xc functionals + __getstate__, __setstate__ = lib.generate_pickle_methods( + excludes=('gdftopt',)) + def build(self, mol, coords): self.gdftopt = _GDFTOpt.from_mol(mol) self.grid_blksize = None diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index d512caa5..496abfa3 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -13,9 +13,9 @@ # limitations under the License. 
# modified by Xiaojie Wu (wxj6000@gmail.com) + import cupy from pyscf.dft import rks - from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid from gpu4pyscf.scf import hf @@ -257,6 +257,7 @@ def __init__(self, xc='LDA,VWN'): ################################################## # don't modify the following attributes, they are not input options self._numint = numint.NumInt() + @property def omega(self): return self._numint.omega @@ -291,8 +292,13 @@ def reset(self, mol=None): hf.SCF.reset(self, mol) self.grids.reset(mol) self.nlcgrids.reset(mol) - self.cphf_grids.reset(mol) self._numint.reset() + # The cphf_grids attribute is not available in the PySCF CPU version. + # In PySCF's to_gpu() function, this attribute is not properly + # initialized. mol of the KS object must be used for initialization. + if mol is None: + mol = self.mol + self.cphf_grids.reset(mol) return self def nuc_grad_method(self): diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py index 99df03ce..c13dba13 100644 --- a/gpu4pyscf/dft/tests/test_libxc.py +++ b/gpu4pyscf/dft/tests/test_libxc.py @@ -92,7 +92,7 @@ def test_u_LDA(self): def test_u_GGA(self): # large errors found in B88 for the spin polarized case - self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-3) + self._check_xc('HYB_GGA_XC_B3LYP', spin=1, fxc_tol=1e-2) self._check_xc('GGA_X_B88', spin=1, fxc_tol=1e-1) self._check_xc('GGA_C_PBE', spin=1, fxc_tol=1e-4) diff --git a/gpu4pyscf/dft/tests/test_rks.py b/gpu4pyscf/dft/tests/test_rks.py index d1bf278d..4bae05ca 100644 --- a/gpu4pyscf/dft/tests/test_rks.py +++ b/gpu4pyscf/dft/tests/test_rks.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pickle import numpy as np import unittest import pyscf @@ -64,11 +65,18 @@ class KnownValues(unittest.TestCase): ''' def test_rks_lda(self): print('------- LDA ----------------') - e_tot = run_dft("LDA, vwn5", mol_sph) + mf = mol_sph.RKS(xc='LDA,vwn5').to_gpu() + mf.grids.level = grids_level + mf.nlcgrids.level = nlcgrids_level + e_tot = mf.kernel() e_ref = -75.9046410402 print('| CPU - GPU |:', e_tot - e_ref) assert np.abs(e_tot - e_ref) < 1e-5 + # test serialization + mf1 = pickle.loads(pickle.dumps(mf)) + assert mf1.e_tot == e_tot + def test_rks_pbe(self): print('------- PBE ----------------') e_tot = run_dft('PBE', mol_sph) diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index c3390e95..dd374cc3 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -30,6 +30,7 @@ from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df from gpu4pyscf.lib import logger +from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import ( LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant, _make_tril_tile_mappings, _nearest_power2) @@ -79,43 +80,41 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - 
ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _ejk_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), 
mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_per_atom_jk_ip1 kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 return ejk, kern_counts, timing_counter def _jk_energy_per_atom(mol, dm, vhfopt=None, @@ -126,7 +125,11 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, log = logger.new_logger(mol, verbose) cput0 = log.init_timer() if vhfopt is None: - vhfopt = _VHFOpt(mol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape @@ -145,7 +148,12 @@ def _jk_energy_per_atom(mol, dm, vhfopt=None, assert uniq_l.max() <= LMAX n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py index 36b45374..8e6ce88c 100644 --- a/gpu4pyscf/gto/int3c1e.py +++ b/gpu4pyscf/gto/int3c1e.py @@ -15,7 +15,7 @@ import ctypes import cupy as cp import numpy as np - +from pyscf import lib from pyscf.scf import _vhf from pyscf.gto import ATOM_OF from pyscf.lib import c_null_ptr @@ -161,7 +161,6 @@ def get_n_hermite_density_of_angular_pair(l): def sort_orbitals(self, mat, axis=[]): ''' Transform given axis of a matrix into sorted AO, - and transform given auxiliary axis of a matrix into sorted auxiliary AO ''' idx = self._ao_idx shape_ones = (1,) * mat.ndim @@ -176,6 +175,24 @@ def sort_orbitals(self, mat, axis=[]): fancy_index.append(indices.reshape(idx_shape)) return mat[tuple(fancy_index)] + def unsort_orbitals(self, 
sorted_mat, axis=[]): + ''' Transform given axis of a matrix into sorted AO, + ''' + idx = self._ao_idx + shape_ones = (1,) * sorted_mat.ndim + fancy_index = [] + for dim, n in enumerate(sorted_mat.shape): + if dim in axis: + assert n == len(idx) + indices = idx + else: + indices = np.arange(n) + idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(indices.reshape(idx_shape)) + mat = cp.empty_like(sorted_mat) + mat[tuple(fancy_index)] = sorted_mat + return mat + @property def bpcache(self): device_id = cp.cuda.Device().id @@ -205,17 +222,17 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split - int3c_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * np.array([1.0]).nbytes) - int3c = np.frombuffer(int3c_pinned_memory_pool, np.float64, ngrids * nao * nao).reshape([ngrids, nao, nao], order='C') + buf_size = ngrids * nao * nao + int3c_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8) + int3c = np.frombuffer(int3c_pinned_buf, np.float64, buf_size).reshape([ngrids, nao, nao], order='C') # int3c = np.zeros([ngrids, nao, nao], order='C') # Using unpinned (pageable) memory, each memcpy is much slower, but there's no initialization time grids = cp.asarray(grids, order='C') if charge_exponents is not None: charge_exponents = cp.asarray(charge_exponents, order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) - int3c_grid_slice = cp.zeros([ngrids_of_split, nao, nao], order='C') + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + int3c_grid_slice = cp.zeros([p1-p0, nao, nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -237,18 +254,19 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): ao_offsets = np.array([i0, j0], 
dtype=np.int32) strides = np.array([ni, ni*nj], dtype=np.int32) - int3c_angular_slice = cp.zeros([ngrids_of_split, j1-j0, i1-i0], order='C') + int3c_angular_slice = cp.zeros([p1-p0, j1-j0, i1-i0], order='C') charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr - + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1] err = libgint.GINTfill_int3c1e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), @@ -270,11 +288,11 @@ def get_int3c1e(mol, grids, charge_exponents, intopt): row, col = np.tril_indices(nao) int3c_grid_slice[:, row, col] = int3c_grid_slice[:, col, row] - ao_idx = np.argsort(intopt._ao_idx) - grid_idx = np.arange(ngrids_of_split) - int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)] - - int3c_grid_slice.get(out = int3c[i_grid_split : i_grid_split + ngrids_of_split, :, :]) + #ao_idx = np.argsort(intopt._ao_idx) + #grid_idx = np.arange(p1-p0) + #int3c_grid_slice = int3c_grid_slice[np.ix_(grid_idx, ao_idx, ao_idx)] + int3c_grid_slice = intopt.unsort_orbitals(int3c_grid_slice, axis=[1,2]) + int3c_grid_slice.get(out = int3c[p0:p1, :, :]) return int3c @@ -355,9 +373,9 @@ def get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) row, col = np.tril_indices(nao) int1e_charge_contracted[row, col] = int1e_charge_contracted[col, row] - ao_idx = np.argsort(intopt._ao_idx) - int1e_charge_contracted = 
int1e_charge_contracted[np.ix_(ao_idx, ao_idx)] - + #ao_idx = np.argsort(intopt._ao_idx) + #int1e_charge_contracted = int1e_charge_contracted[np.ix_(ao_idx, ao_idx)] + int1e_charge_contracted = intopt.unsort_orbitals(int1e_charge_contracted, axis=[0,1]) return int1e_charge_contracted def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): @@ -385,7 +403,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() n_total_hermite_density = intopt.density_offset[-1] - dm_pair_ordered = np.zeros(n_total_hermite_density) + dm_pair_ordered = np.empty(n_total_hermite_density) libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1), @@ -413,8 +431,7 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): int3c_density_contracted = cp.zeros(ngrids) - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): for cp_ij_id, _ in enumerate(intopt.log_qs): stream = cp.cuda.get_current_stream() @@ -425,21 +442,22 @@ def get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt): charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type n_pair_sum_per_thread = nao_cart - + grids_slice = grids[p0:p1, :] err = 
libgint.GINTfill_int3c1e_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), intopt.density_offset.ctypes.data_as(ctypes.c_void_p), - ctypes.cast(int3c_density_contracted[i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[p0:p1].data.ptr, ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), ctypes.c_int(cp_ij_id), diff --git a/gpu4pyscf/gto/int3c1e_ip.py b/gpu4pyscf/gto/int3c1e_ip.py index cc53feab..8b47adce 100644 --- a/gpu4pyscf/gto/int3c1e_ip.py +++ b/gpu4pyscf/gto/int3c1e_ip.py @@ -15,7 +15,7 @@ import ctypes import cupy as cp import numpy as np - +from pyscf import lib from pyscf.gto import ATOM_OF from pyscf.lib import c_null_ptr from gpu4pyscf.lib.cupy_helper import load_library, cart2sph, get_avail_mem @@ -40,19 +40,19 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): "the 3 center integral first derivative, " "which requires {total_double_number * 8 / 1e9 : .1f} GB of memory") ngrids_per_split = (ngrids + n_grid_split - 1) // n_grid_split - - int3cip1_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes) - int3c_ip1 = np.frombuffer(int3cip1_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C') - int3cip2_pinned_memory_pool = cp.cuda.alloc_pinned_memory(ngrids * nao * nao * 3 * np.array([1.0]).nbytes) - int3c_ip2 = np.frombuffer(int3cip2_pinned_memory_pool, np.float64, ngrids * nao * nao * 3).reshape([3, ngrids, nao, nao], order='C') + + buf_size = ngrids * nao * nao * 3 + int3cip1_pinned_buf = 
cp.cuda.alloc_pinned_memory(buf_size * 8) + int3c_ip1 = np.frombuffer(int3cip1_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C') + int3cip2_pinned_buf = cp.cuda.alloc_pinned_memory(buf_size * 8) + int3c_ip2 = np.frombuffer(int3cip2_pinned_buf, np.float64, buf_size).reshape([3, ngrids, nao, nao], order='C') grids = cp.asarray(grids, order='C') if charge_exponents is not None: charge_exponents = cp.asarray(charge_exponents, order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) - int3c_grid_slice = cp.zeros([6, ngrids_of_split, nao, nao], order='C') + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): + int3c_grid_slice = cp.zeros([6, p1-p0, nao, nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -74,18 +74,20 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): ao_offsets = np.array([i0, j0], dtype=np.int32) strides = np.array([ni, ni*nj], dtype=np.int32) - int3c_angular_slice = cp.zeros([6, ngrids_of_split, j1-j0, i1-i0], order='C') + int3c_angular_slice = cp.zeros([6, p1-p0, j1-j0, i1-i0], order='C') charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1, :] err = libgint.GINTfill_int3c1e_ip( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(int3c_angular_slice.data.ptr, ctypes.c_void_p), strides.ctypes.data_as(ctypes.c_void_p), 
ao_offsets.ctypes.data_as(ctypes.c_void_p), @@ -103,20 +105,20 @@ def get_int3c1e_ip(mol, grids, charge_exponents, intopt): int3c_angular_slice = cart2sph(int3c_angular_slice, axis=2, ang=lj) int3c_angular_slice = cart2sph(int3c_angular_slice, axis=3, ang=li) - int3c_grid_slice[:, :, j0:j1, i0:i1] = int3c_angular_slice + int3c_grid_slice[:, :, i0:i1, j0:j1] = int3c_angular_slice.transpose(0,1,3,2) ao_idx = np.argsort(intopt._ao_idx) - grid_idx = np.arange(ngrids_of_split) + grid_idx = np.arange(p1-p0) derivative_idx = np.arange(6) int3c_grid_slice = int3c_grid_slice[np.ix_(derivative_idx, grid_idx, ao_idx, ao_idx)] # Each piece of the following memory is contiguous - int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, i_grid_split : i_grid_split + ngrids_of_split, :, :]) - int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, i_grid_split : i_grid_split + ngrids_of_split, :, :]) + int3c_grid_slice[0, :, :, :].get(out = int3c_ip1[0, p0:p1, :, :]) + int3c_grid_slice[1, :, :, :].get(out = int3c_ip1[1, p0:p1, :, :]) + int3c_grid_slice[2, :, :, :].get(out = int3c_ip1[2, p0:p1, :, :]) + int3c_grid_slice[3, :, :, :].get(out = int3c_ip2[0, p0:p1, :, :]) + int3c_grid_slice[4, :, :, :].get(out = int3c_ip2[1, p0:p1, :, :]) + int3c_grid_slice[5, :, :, :].get(out = int3c_ip2[2, p0:p1, :, :]) return int3c_ip1, int3c_ip2 @@ -134,7 +136,7 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int charges = charges.reshape([-1, 1], order='C') grids = cp.concatenate([grids, charges], axis=1) - int1e_charge_contracted = cp.zeros([3, 
mol.nao, mol.nao], order='C') + int1e_charge_contracted = cp.empty([3, mol.nao, mol.nao], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] @@ -191,13 +193,68 @@ def get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, int int1e_angular_slice = cart2sph(int1e_angular_slice, axis=1, ang=lj) int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=li) - int1e_charge_contracted[:, j0:j1, i0:i1] = int1e_angular_slice + int1e_charge_contracted[:, i0:i1, j0:j1] = int1e_angular_slice.transpose(0,2,1) + + return intopt.unsort_orbitals(int1e_charge_contracted, axis=[1,2]) + +def get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." + + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + dm = cp.asarray(dm) + assert dm.ndim == 2 + assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao + + dm = intopt.sort_orbitals(dm, [0,1]) + if not mol.cart: + cart2sph_transformation_matrix = intopt.cart2sph + # TODO: This part is inefficient (O(N^3)), should be changed to the O(N^2) algorithm + dm = cart2sph_transformation_matrix @ dm @ cart2sph_transformation_matrix.T + dm = dm.flatten(order='F') # Column major order matches (i + j * n_ao) access pattern in the C function + + nao = intopt._sorted_mol.nao + + i_atom_of_each_shell = intopt._sorted_mol._bas[:, ATOM_OF] + i_atom_of_each_shell = cp.array(i_atom_of_each_shell, dtype=np.int32) + + ip1_per_atom = cp.zeros([mol.natm, 3, ngrids]) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if 
charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr - ao_idx = np.argsort(intopt._ao_idx) - derivative_idx = np.arange(3) - int1e_charge_contracted = int1e_charge_contracted[np.ix_(derivative_idx, ao_idx, ao_idx)] + err = libgint.GINTfill_int3c1e_ip1_density_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(ip1_per_atom.data.ptr, ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(dm.data.ptr, ctypes.c_void_p), + ctypes.cast(i_atom_of_each_shell.data.ptr, ctypes.c_void_p), + ctypes.c_int(nao), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') - return int1e_charge_contracted + return ip1_per_atom def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt): omega = mol.omega @@ -228,10 +285,11 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) bas_coords = intopt._sorted_mol.atom_coords()[intopt._sorted_mol._bas[:, ATOM_OF]].flatten() n_total_hermite_density = intopt.density_offset[-1] - dm_pair_ordered = np.zeros(n_total_hermite_density) + dm_pair_ordered = np.empty(n_total_hermite_density) libgint.GINTinit_J_density_rys_preprocess(dm.ctypes.data_as(ctypes.c_void_p), dm_pair_ordered.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(1), ctypes.c_int(nao_cart), ctypes.c_int(len(intopt.bas_pairs_locs) - 1), + ctypes.c_int(1), ctypes.c_int(nao_cart), + ctypes.c_int(len(intopt.bas_pairs_locs) - 1), intopt.bas_pair2shls.ctypes.data_as(ctypes.c_void_p), intopt.bas_pairs_locs.ctypes.data_as(ctypes.c_void_p), l_ij.ctypes.data_as(ctypes.c_void_p), @@ -252,8 +310,7 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) int3c_density_contracted = cp.zeros([3, 
ngrids], order='C') - for i_grid_split in range(0, ngrids, ngrids_per_split): - ngrids_of_split = np.min([ngrids_per_split, ngrids - i_grid_split]) + for p0, p1 in lib.prange(0, ngrids, ngrids_per_split): for cp_ij_id, _ in enumerate(intopt.log_qs): stream = cp.cuda.get_current_stream() @@ -264,7 +321,9 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) charge_exponents_pointer = c_null_ptr() if charge_exponents is not None: - charge_exponents_pointer = charge_exponents[i_grid_split : i_grid_split + ngrids_of_split].data.ptr + exponents_slice = charge_exponents[p0:p1] + charge_exponents_pointer = exponents_slice.data.ptr + grids_slice = grids[p0:p1] # n_pair_sum_per_thread = 1 # means every thread processes one pair and one grid # n_pair_sum_per_thread = nao_cart # or larger number gaurantees one thread processes one grid and all pairs of the same type @@ -273,12 +332,12 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) err = libgint.GINTfill_int3c1e_ip2_density_contracted( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(grids[i_grid_split : i_grid_split + ngrids_of_split, :].data.ptr, ctypes.c_void_p), + ctypes.cast(grids_slice.data.ptr, ctypes.c_void_p), ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), - ctypes.c_int(ngrids_of_split), + ctypes.c_int(p1-p0), ctypes.cast(dm_pair_ordered.data.ptr, ctypes.c_void_p), intopt.density_offset.ctypes.data_as(ctypes.c_void_p), - ctypes.cast(int3c_density_contracted[:, i_grid_split : i_grid_split + ngrids_of_split].data.ptr, ctypes.c_void_p), + ctypes.cast(int3c_density_contracted[:, p0:p1].data.ptr, ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), ctypes.c_int(cp_ij_id), @@ -290,6 +349,82 @@ def get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) return int3c_density_contracted +def get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, 
output, intopt): + omega = mol.omega + assert omega >= 0.0, "Short-range one electron integrals with GPU acceleration is not implemented." + + ngrids = grids.shape[0] + grids = cp.asarray(grids, order='C') + if charge_exponents is not None: + charge_exponents = cp.asarray(charge_exponents, order='C') + + assert charges.ndim == 1 and charges.shape[0] == grids.shape[0] + charges = cp.asarray(charges).astype(np.float64) + + charges = charges.reshape([-1, 1], order='C') + grids = cp.concatenate([grids, charges], axis=1) + + n_atom = len(gridslice) + i_atom_of_each_charge = [[i_atom] * (gridslice[i_atom][1] - gridslice[i_atom][0]) for i_atom in range(n_atom)] + i_atom_of_each_charge = sum(i_atom_of_each_charge, []) + i_atom_of_each_charge = cp.array(i_atom_of_each_charge, dtype=np.int32) + + assert isinstance(output, cp.ndarray) + assert output.shape == (n_atom, 3, mol.nao, mol.nao) + + for cp_ij_id, _ in enumerate(intopt.log_qs): + cpi = intopt.cp_idx[cp_ij_id] + cpj = intopt.cp_jdx[cp_ij_id] + li = intopt.angular[cpi] + lj = intopt.angular[cpj] + + stream = cp.cuda.get_current_stream() + + log_q_ij = intopt.log_qs[cp_ij_id] + + nbins = 1 + bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) + + i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] + j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] + ni = i1 - i0 + nj = j1 - j0 + + ao_offsets = np.array([i0, j0], dtype=np.int32) + strides = np.array([ni, ni*nj], dtype=np.int32) + + charge_exponents_pointer = c_null_ptr() + if charge_exponents is not None: + charge_exponents_pointer = charge_exponents.data.ptr + + int1e_angular_slice = cp.zeros([n_atom, 3, j1-j0, i1-i0], order='C') + + err = libgint.GINTfill_int3c1e_ip2_charge_contracted( + ctypes.cast(stream.ptr, ctypes.c_void_p), + intopt.bpcache, + ctypes.cast(grids.data.ptr, ctypes.c_void_p), + ctypes.cast(charge_exponents_pointer, ctypes.c_void_p), + ctypes.c_int(ngrids), + ctypes.cast(int1e_angular_slice.data.ptr, ctypes.c_void_p), + 
strides.ctypes.data_as(ctypes.c_void_p), + ao_offsets.ctypes.data_as(ctypes.c_void_p), + bins_locs_ij.ctypes.data_as(ctypes.c_void_p), + ctypes.c_int(nbins), + ctypes.c_int(cp_ij_id), + ctypes.cast(i_atom_of_each_charge.data.ptr, ctypes.c_void_p), + ctypes.c_double(omega)) + + if err != 0: + raise RuntimeError('GINTfill_int3c1e_charge_contracted failed') + + i0, i1 = intopt.ao_loc[cpi], intopt.ao_loc[cpi+1] + j0, j1 = intopt.ao_loc[cpj], intopt.ao_loc[cpj+1] + if not mol.cart: + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=2, ang=lj) + int1e_angular_slice = cart2sph(int1e_angular_slice, axis=3, ang=li) + + output[np.ix_(range(n_atom), range(3), intopt._ao_idx[i0:i1], intopt._ao_idx[j0:j1])] += int1e_angular_slice.transpose(0,1,3,2) + def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): dm = cp.asarray(dm) if dm.ndim == 3: @@ -302,7 +437,7 @@ def get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, assert dm.shape[0] == dm.shape[1] and dm.shape[0] == mol.nao int3c_ip1 = get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) - int3c_ip1 = cp.einsum('xji,ij->xi', int3c_ip1, dm) + int3c_ip1 = cp.einsum('xij,ij->xi', int3c_ip1, dm) return int3c_ip1 def get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt): @@ -319,13 +454,18 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. 
- If charges is not None, the function computes the following contraction: + If charges is not None and density is None, the function computes the following contraction: $$\sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. If charges is not None and dm is not None, the function computes the following contraction: $$\sum_\nu^{n_{ao}} D_{\mu\nu} \sum_{C}^{n_{charge}} q_C \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + + If dm is not None and charges is None, the function computes the following contraction: + $$\sum_{\mu \in \{\text{AO of atom A}\}} \sum_\nu^{n_{ao}} D_{\mu\nu} + \left(\frac{\partial}{\partial \vec{A}} \mu \middle| \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + The output dimension is $(n_{atom}, 3, n_{charge})$. ''' assert grids is not None @@ -340,12 +480,14 @@ def int1e_grids_ip1(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[0] - else: - assert charges is not None + elif charges is not None: if dm is not None: return get_int3c1e_ip1_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip1_charge_contracted(mol, grids, charge_exponents, charges, intopt) + else: + assert dm is not None + return get_int3c1e_ip1_density_contracted(mol, grids, charge_exponents, dm, intopt) def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): r''' @@ -353,12 +495,16 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di $$\left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $\mu(\vec{r})$ centers at $\vec{A}$ and $\nu(\vec{r})$ centers at $\vec{B}$. 
- If dm is not None, the function computes the following contraction: + If dm is not None and charges is None, the function computes the following contraction: $$\sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ If dm is not None and charges is not None, the function computes the following contraction: $$q_C \sum_{\mu, \nu}^{n_{ao}} D_{\mu\nu} \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ where $q_C$ is the charge centered at $\vec{C}$. + + If charges is not None and dm is None, the function computes the following contraction: + $$\sum_{C}^{n_{charge}} q_C \left(\mu \middle| \frac{\partial}{\partial \vec{C}} \frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + Notice that this summation should not be performed if the charges originates from different atomic centers. ''' assert grids is not None @@ -373,9 +519,36 @@ def int1e_grids_ip2(mol, grids, charge_exponents=None, dm=None, charges=None, di if dm is None and charges is None: return get_int3c1e_ip(mol, grids, charge_exponents, intopt)[1] - else: - assert dm is not None + elif dm is not None: if charges is not None: return get_int3c1e_ip2_charge_and_density_contracted(mol, grids, charge_exponents, dm, charges, intopt) else: return get_int3c1e_ip2_density_contracted(mol, grids, charge_exponents, dm, intopt) + else: + assert charges is not None + output = cp.zeros([1, 3, mol.nao, mol.nao]) + get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, [[0, grids.shape[0]]], output, intopt) + return output.reshape([3, mol.nao, mol.nao]) + +def int1e_grids_ip2_charge_contracted(mol, grids, charges, gridslice, output, charge_exponents=None, direct_scf_tol=1e-13, intopt=None): + r''' + This function computes the following contraction: + $$\sum_{C \in \{\text{grid attached to atom A}\}} q_C + \left(\mu \middle| \frac{\partial}{\partial \vec{C}} 
\frac{1}{|\vec{r} - \vec{C}|} \middle| \nu\right)$$ + where $q_C$ is the charge centered at $\vec{C}$. The output dimension is $(n_{atom}, 3, n_{ao}, n_{ao})$. + ''' + assert grids is not None + assert charges is not None + assert gridslice is not None + assert output is not None + + if intopt is None: + intopt = VHFOpt(mol) + intopt.build(direct_scf_tol, aosym=False) + else: + assert isinstance(intopt, VHFOpt), \ + f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." + assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." + assert not intopt.aosym + + return get_int3c1e_ip2_charge_contracted(mol, grids, charge_exponents, charges, gridslice, output, intopt) diff --git a/gpu4pyscf/gto/moleintor.py b/gpu4pyscf/gto/moleintor.py deleted file mode 100644 index f386aed2..00000000 --- a/gpu4pyscf/gto/moleintor.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import ctypes -import cupy as cp -import numpy as np - -from gpu4pyscf.gto.int3c1e import VHFOpt, get_int3c1e, get_int3c1e_density_contracted, get_int3c1e_charge_contracted -from gpu4pyscf.gto.int3c1e_ip import get_int3c1e_ip, get_int3c1e_ip_contracted - -def intor(mol, intor, grids, charge_exponents=None, dm=None, charges=None, direct_scf_tol=1e-13, intopt=None): - assert grids is not None - - if intopt is None: - intopt = VHFOpt(mol) - aosym = False if 'ip' in intor else True - intopt.build(direct_scf_tol, aosym=aosym) - else: - assert isinstance(intopt, VHFOpt), \ - f"Please make sure intopt is a {VHFOpt.__module__}.{VHFOpt.__name__} object." - assert hasattr(intopt, "density_offset"), "Please call build() function for VHFOpt object first." - - if intor == 'int1e_grids': - assert dm is None or charges is None, \ - "Are you sure you want to contract the one electron integrals with both charge and density? " + \ - "If so, pass in density, obtain the result with n_charge and contract with the charges yourself." 
- assert intopt.aosym - - if dm is None and charges is None: - return get_int3c1e(mol, grids, charge_exponents, intopt) - elif dm is not None: - return get_int3c1e_density_contracted(mol, grids, charge_exponents, dm, intopt) - elif charges is not None: - return get_int3c1e_charge_contracted(mol, grids, charge_exponents, charges, intopt) - else: - raise ValueError(f"Logic error in {__file__} {__name__}") - elif intor == 'int1e_grids_ip': - assert not intopt.aosym - - if dm is None and charges is None: - return get_int3c1e_ip(mol, grids, charge_exponents, intopt) - else: - assert dm is not None - assert charges is not None - return get_int3c1e_ip_contracted(mol, grids, charge_exponents, dm, charges, intopt) - else: - raise NotImplementedError(f"GPU intor {intor} is not implemented.") diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py index e77f30ec..56f87e4b 100644 --- a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py +++ b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py @@ -18,7 +18,7 @@ import cupy as cp import pyscf from pyscf import lib, gto, df -from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2, int1e_grids_ip2_charge_contracted def setUpModule(): global mol_sph, mol_cart, grid_points, integral_threshold, density_contraction_threshold, charge_contraction_threshold @@ -74,8 +74,8 @@ def test_int1e_grids_ip_full_tensor_cart(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -94,8 +94,8 @@ def 
test_int1e_grids_ip_full_tensor_sph(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -117,8 +117,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -141,8 +141,8 @@ def test_int1e_grids_ip_full_tensor_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points) test_int1e_dC = int1e_grids_ip2(mol, grid_points) - test_int1e_dA = test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -168,8 +168,8 @@ def test_int1e_grids_ip_full_tensor_gaussian_charge_omega(self): test_int1e_dA = int1e_grids_ip1(mol, grid_points, charge_exponents = charge_exponents) test_int1e_dC = int1e_grids_ip2(mol, grid_points, charge_exponents = charge_exponents) - test_int1e_dA = 
test_int1e_dA.transpose(0, 3, 2, 1) - test_int1e_dC = test_int1e_dC.transpose(0, 3, 2, 1) + test_int1e_dA = test_int1e_dA.transpose(0, 2, 3, 1) + test_int1e_dC = test_int1e_dC.transpose(0, 2, 3, 1) np.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) np.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) @@ -314,6 +314,55 @@ def test_int1e_grids_ip_contracted_gaussian_charge_omega(self): cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + def test_int1e_grids_ip2_charge_contracted(self): + np.random.seed(12346) + charges = np.random.uniform(-2.0, 2.0, grid_points.shape[0]) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip2 = mol._add_suffix('int3c2e_ip2') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip2) + q_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip2, aosym='s1', cintopt=cintopt) + + ngrids = grid_points.shape[0] + n_atom = mol.natm + nao = mol.nao + gridslice = [[ngrids * i // n_atom, ngrids * (i + 1) // n_atom] for i in range(n_atom)] + ref_int1e_dC = np.zeros([n_atom, 3, nao, nao]) + for i_atom in range(n_atom): + g0,g1 = gridslice[i_atom] + ref_int1e_dC[i_atom, :, :, :] += np.einsum('dijq,q->dij', q_nj[:, :, :, g0:g1], charges[g0:g1]) + + test_int1e_dC = cp.zeros([n_atom, 3, nao, nao]) + int1e_grids_ip2_charge_contracted(mol, grid_points, charges, gridslice, test_int1e_dC) + + cp.testing.assert_allclose(ref_int1e_dC, test_int1e_dC, atol = integral_threshold) + + def test_int1e_grids_ip1_density_contracted(self): + np.random.seed(12347) + dm = np.random.uniform(-2.0, 2.0, (mol_sph.nao, mol_sph.nao)) + + mol = mol_sph + fakemol = gto.fakemol_for_charges(grid_points) + + int3c2e_ip1 = mol._add_suffix('int3c2e_ip1') + cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip1) + v_nj = 
df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1, aosym='s1', cintopt=cintopt) + + v_nj = np.einsum('dijq,ij->dqi', v_nj, dm) + + ngrids = grid_points.shape[0] + aoslice = np.array(mol.aoslice_by_atom()) + ref_int1e_dA = np.empty([mol.natm, 3, ngrids]) + for i_atom in range(mol.natm): + p0,p1 = aoslice[i_atom, 2:] + ref_int1e_dA[i_atom,:,:] = np.einsum('dqi->dq', v_nj[:,:,p0:p1]) + + test_int1e_dA = int1e_grids_ip1(mol, grid_points, dm = dm) + + cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold) + if __name__ == "__main__": print("Full Tests for One Electron Coulomb Integrals") unittest.main() diff --git a/gpu4pyscf/hessian/jk.py b/gpu4pyscf/hessian/jk.py new file mode 100644 index 00000000..65edff6b --- /dev/null +++ b/gpu4pyscf/hessian/jk.py @@ -0,0 +1,305 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +''' +Compute J/K matrices for Hessian +''' +import ctypes +import math +import numpy as np +import cupy as cp +from collections import Counter +from concurrent.futures import ThreadPoolExecutor + +from pyscf import lib +from pyscf.scf import _vhf +from pyscf import __config__ +from gpu4pyscf.scf import jk +from gpu4pyscf.scf.jk import (_make_tril_tile_mappings, quartets_scheme, QUEUE_DEPTH, + _VHFOpt, LMAX, init_constant, libvhf_rys) +from gpu4pyscf.lib.cupy_helper import (condense, sandwich_dot, transpose_sum, + reduce_to_device, contract) + +from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.lib import logger + + +def _ao2mo(v_ao, mocc, mo_coeff): + v_ao = contract('nij,jo->nio', v_ao, mocc) + return contract('nio,ip->npo', v_ao, mo_coeff) + +def _jk_task(mol, dms, mo_coeff, mo_occ, vhfopt, task_list, hermi=0, + device_id=0, with_j=True, with_k=True, verbose=0): + nao, _ = vhfopt.coeff.shape + uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = vhfopt.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + kern = libvhf_rys.RYS_build_jk + + timing_counter = Counter() + kern_counts = 0 + with cp.cuda.Device(device_id), _streams[device_id]: + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + dms = cp.asarray(dms) + coeff = cp.asarray(vhfopt.coeff) + + # Transform MO coeffcients and DM into sorted, cartesian AO basis + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, coeff.T) + dms = cp.asarray(dms, order='C') + + n_dm = dms.shape[0] + tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) + q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) + s_ptr = lib.c_null_ptr() + if mol.omega < 0: + s_ptr = ctypes.cast(vhfopt.s_estimator.data.ptr, ctypes.c_void_p) + + vj = vk = None + vj_ptr = vk_ptr = lib.c_null_ptr() + assert with_j or with_k + if with_k: + vk = 
cp.zeros(dms.shape) + vk_ptr = ctypes.cast(vk.data.ptr, ctypes.c_void_p) + if with_j: + vj = cp.zeros(dms.shape) + vj_ptr = ctypes.cast(vj.data.ptr, ctypes.c_void_p) + + ao_loc = mol.ao_loc + dm_cond = cp.log(condense('absmax', dms, ao_loc) + 1e-300).astype(np.float32) + log_max_dm = dm_cond.max() + log_cutoff = math.log(vhfopt.direct_scf_tol) + tile_mappings = _make_tril_tile_mappings(l_ctr_bas_loc, vhfopt.tile_q_cond, + log_cutoff-log_max_dm) + workers = gpu_specs['multiProcessorCount'] + pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16) + info = cp.empty(2, dtype=np.uint32) + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) + + for i, j, k, l in task_list: + ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], + l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) + tile_ij_mapping = tile_mappings[i,j] + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + 
t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 + if with_j: + vj *= 2.0 + vj = transpose_sum(vj) + if with_k: + vk = transpose_sum(vk) + + assert mo_coeff.ndim == 2 or mo_coeff.ndim == 3 + if mo_coeff.ndim == 3: + # Unrestricted case + mo_coeff = cp.asarray(mo_coeff) + mo_occ = cp.asarray(mo_occ) + moa = coeff.dot(mo_coeff[0]) + mob = coeff.dot(mo_coeff[1]) + nmoa, nmob = moa.shape[1], mob.shape[1] + mocca = moa[:,mo_occ[0] > 0.5] + moccb = mob[:,mo_occ[1] > 0.5] + nocca, noccb = mocca.shape[1], moccb.shape[1] + n_dm_2 = n_dm//2 + if with_j: + vjab = vj[:n_dm_2] + vj[n_dm_2:] + vj = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vj[:,:nmoa*nocca] = _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj[:,nmoa*nocca:] = _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + if with_k: + vka, vkb = vk[:n_dm_2], vk[n_dm_2:] + vk = cp.empty([n_dm_2,nmoa*nocca+nmob*noccb]) + vk[:,:nmoa*nocca] = _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) + vk[:,nmoa*nocca:] = _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) + else: + mo_coeff = cp.asarray(mo_coeff) + mo_occ = cp.asarray(mo_occ) + mo_coeff = coeff.dot(mo_coeff) + mocc = mo_coeff[:,mo_occ>0.5] + if with_j: + vj = _ao2mo(vj, mocc, mo_coeff).reshape(n_dm,-1) + if with_k: + vk = _ao2mo(vk, mocc, mo_coeff).reshape(n_dm,-1) + + return vj, vk, kern_counts, timing_counter + +def get_jk(mol, dm, mo_coeff, mo_occ, hermi=0, vhfopt=None, + with_j=True, with_k=True, verbose=None): + '''Compute J, K matrices in MO + ''' + log = logger.new_logger(mol, verbose) + cput0 = log.init_timer() + assert hermi == 1 + if vhfopt is None: + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) + + mol = vhfopt.sorted_mol + nao, nao_orig = vhfopt.coeff.shape + + dm = cp.asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + n_dm = dms.shape[0] + + assert with_j or with_k + + init_constant(mol) + + 
uniq_l_ctr = vhfopt.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + n_groups = np.count_nonzero(uniq_l <= LMAX) + + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) + tasks = np.array(tasks) + task_list = [] + for device_id in range(_num_devices): + task_list.append(tasks[device_id::_num_devices]) + + cp.cuda.get_current_stream().synchronize() + futures = [] + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _jk_task, + mol, dms, mo_coeff, mo_occ, vhfopt, task_list[device_id], hermi=hermi, + with_j=with_j, with_k=with_k, verbose=verbose, + device_id=device_id) + futures.append(future) + + kern_counts = 0 + timing_collection = Counter() + vj_dist = [] + vk_dist = [] + for future in futures: + vj, vk, counts, counter = future.result() + kern_counts += counts + timing_collection += counter + vj_dist.append(vj) + vk_dist.append(vk) + + if log.verbose >= logger.DEBUG1: + log.debug1('kernel launches %d', kern_counts) + for llll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', llll, t) + + for s in _streams: + s.synchronize() + cp.cuda.get_current_stream().synchronize() + vj = vk = None + if with_k: + vk = reduce_to_device(vk_dist, inplace=True) + + if with_j: + vj = reduce_to_device(vj_dist, inplace=True) + + h_shls = vhfopt.h_shls + if h_shls: + cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) + log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) + scripts = [] + if with_j: + scripts.append('ji->s2kl') + if with_k: + if hermi == 1: + scripts.append('jk->s2il') + else: + scripts.append('jk->s1il') + # Transform MO coeffcients and DM into sorted, cartesian AO basis + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, vhfopt.coeff.T) + dms = cp.asarray(dms, order='C') + shls_excludes = 
[0, h_shls[0]] * 4 + vs_h = _vhf.direct_mapdm('int2e_cart', 's8', scripts, + dms.get(), 1, mol._atm, mol._bas, mol._env, + shls_excludes=shls_excludes) + if with_j and with_k: + vj1 = vs_h[0] + vk1 = vs_h[1] + elif with_j: + vj1 = vs_h[0] + else: + vk1 = vs_h[0] + + idx, idy = np.tril_indices(nao, -1) + if hermi == 1: + if with_j: + vj1[:,idy,idx] = vj1[:,idx,idy] + if with_k: + vk1[:,idy,idx] = vk1[:,idx,idy] + + if mo_coeff.ndim == 3: + moa = vhfopt.coeff.dot(mo_coeff[0]) + mob = vhfopt.coeff.dot(mo_coeff[1]) + mocca = moa[:,mo_occ[0]>0.5] + moccb = mob[:,mo_occ[1]>0.5] + nmoa = moa.shape[1] + nocca = mocca.shape[1] + n_dm_2 = n_dm//2 + if with_j: + vjab = vj1[:n_dm_2] + vj1[n_dm_2:] + vj[:,:nmoa*nocca] += _ao2mo(vjab, mocca, moa).reshape(n_dm_2,-1) + vj[:,nmoa*nocca:] += _ao2mo(vjab, moccb, mob).reshape(n_dm_2,-1) + if with_k: + vka, vkb = vk1[:n_dm_2], vk1[n_dm_2:] + vk[:,:nmoa*nocca] += _ao2mo(vka, mocca, moa).reshape(n_dm_2,-1) + vk[:,nmoa*nocca:] += _ao2mo(vkb, moccb, mob).reshape(n_dm_2,-1) + else: + mo_coeff = vhfopt.coeff.dot(mo_coeff) + mocc = mo_coeff[:,mo_occ>0.5] + if with_j: + vj += _ao2mo(cp.asarray(vj1), mocc, mo_coeff).reshape(n_dm,-1) + if with_k: + vk += _ao2mo(cp.asarray(vk1), mocc, mo_coeff).reshape(n_dm,-1) + log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) + log.timer('vj and vk', *cput0) + return vj, vk diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index f5291b54..775a6e98 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -25,10 +25,8 @@ from collections import Counter from concurrent.futures import ThreadPoolExecutor from pyscf.hessian import rhf as rhf_hess_cpu -from pyscf import lib +from pyscf import lib, gto from pyscf.gto import ATOM_OF -# import _response_functions to load gen_response methods in SCF class -from gpu4pyscf.scf import _response_functions # noqa from gpu4pyscf.scf import cphf from gpu4pyscf.lib.cupy_helper import (reduce_to_device, contract, tag_array, 
sandwich_dot, transpose_sum, get_avail_mem, condense, @@ -37,9 +35,10 @@ from gpu4pyscf.__config__ import _streams, _num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import ( - LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant, - _make_tril_tile_mappings, _nearest_power2) + LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, GROUP_SIZE, libvhf_rys, _VHFOpt, + init_constant, _make_tril_tile_mappings, _nearest_power2) from gpu4pyscf.grad import rhf as rhf_grad +from gpu4pyscf.hessian import jk libvhf_rys.RYS_per_atom_jk_ip2_type12.restype = ctypes.c_int libvhf_rys.RYS_per_atom_jk_ip2_type3.restype = ctypes.c_int @@ -77,10 +76,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, h1mo = h1mo.get() t1 = log.timer_debug1('making H1', *t1) if mo1 is None or mo_e1 is None: + fx = hessobj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo, - None, atmlst, max_memory, log) + fx, atmlst, max_memory, log) t1 = log.timer_debug1('solving MO1', *t1) - mo1 = cupy.asarray(mo1) # *2 for double occupancy, *2 for +c.c. 
de2 += contract('kxpi,lypi->klxy', cupy.asarray(h1mo), mo1) * 4 @@ -179,6 +178,11 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, log = logger.new_logger(mol, verbose) cput0 = log.init_timer() dms = cp.asarray(dms) + coeff = cp.asarray(vhfopt.coeff) + + #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) + dms = sandwich_dot(dms, coeff.T) + dms = cp.asarray(dms, order='C') tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) @@ -200,62 +204,60 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err1 = kern1( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - 
mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - err2 = kern2( - ctypes.cast(ejk.data.ptr, ctypes.c_void_p), - ctypes.c_double(j_factor), ctypes.c_double(k_factor), - ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err1 != 0 or err2 != 0: - raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = _ip2_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err1 = kern1( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + 
ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + err2 = kern2( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err1 != 0 or err2 != 0: + raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 ejk = ejk + ejk.transpose(1,0,3,2) return ejk, kern_counts, timing_counter @@ -267,16 +269,17 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non log = logger.new_logger(mol, verbose) cput0 = log.init_timer() if vhfopt is None: - vhfopt = _VHFOpt(mol).build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape dm = cp.asarray(dm, order='C') dms = dm.reshape(-1,nao_orig,nao_orig) - #:dms = cp.einsum('pi,nij,qj->npq', 
vhfopt.coeff, dms, vhfopt.coeff) - dms = sandwich_dot(dms, vhfopt.coeff.T) - dms = cp.asarray(dms, order='C') init_constant(mol) @@ -285,7 +288,12 @@ def _partial_ejk_ip2(mol, dm, vhfopt=None, j_factor=1., k_factor=1., verbose=Non assert uniq_l.max() <= LMAX n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): @@ -354,16 +362,18 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): assert atmlst is None mol = hessobj.mol natm = mol.natm - nao = mo_coeff.shape[0] mo_coeff = cp.asarray(mo_coeff) mocc = cp.asarray(mo_coeff[:,mo_occ>0]) dm0 = mocc.dot(mocc.T) * 2 h1mo = rhf_grad.get_grad_hcore(hessobj.base.Gradients()) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem = get_avail_mem() - slice_size = int(avail_mem*0.6) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3) for atoms_slice in lib.prange(0, natm, slice_size): - vj, vk = _get_jk(mol, dm0, atoms_slice=atoms_slice, verbose=verbose) + vj, vk = _get_jk_ip1(mol, dm0, atoms_slice=atoms_slice, verbose=verbose) #:vhf = vj - vk * .5 vhf = vk vhf *= -.5 @@ -375,9 +385,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj = vk = vhf = None return h1mo - def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, device_id=0, with_j=True, with_k=True, verbose=0): + # TODO: compute JK in MO assert isinstance(verbose, int) nao, _ = vhfopt.coeff.shape natm = mol.natm @@ -391,7 +401,6 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, uniq_l = uniq_l_ctr[:,0] l_ctr_bas_loc = vhfopt.l_ctr_offsets l_symb = [lib.param.ANGULAR[i] for i in uniq_l] - n_groups = len(uniq_l_ctr) kern = 
libvhf_rys.RYS_build_jk_ip1 timing_counter = Counter() @@ -423,7 +432,7 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) ish0, ish1 = l_ctr_bas_loc[i], l_ctr_bas_loc[i+1] @@ -438,42 +447,40 @@ def _build_jk_ip1_task(mol, dms, vhfopt, task_list, atoms_slice, cp.arange(jsh0, jsh1, dtype=np.int32)) idx = cp.argsort(sub_tile_q[mask])[::-1] tile_ij_mapping = t_ij[mask][idx] - for k in range(n_groups): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tril_tile_mappings[k,l] - scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), - ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), - lib.c_null_ptr(), - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - 
t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tril_tile_mappings[k,l] + scheme = _ip1_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), ctypes.c_int(atom0), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p), + lib.c_null_ptr(), + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 return vj, vk, kern_counts, timing_counter -def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): +def _get_jk_ip1(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): r''' For each atom, compute J = ((\nabla_X i) j| kl) (D_lk + D_ji) @@ -485,7 +492,11 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): vhfopt = _VHFOpt(mol) # tile must set to 1. 
This tile size is assumed in the GPU kernel code vhfopt.tile = 1 - vhfopt.build() + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = GROUP_SIZE + vhfopt.build(group_size=group_size) mol = vhfopt.sorted_mol nao, nao_orig = vhfopt.coeff.shape @@ -513,7 +524,12 @@ def _get_jk(mol, dm, with_j=True, with_k=True, atoms_slice=None, verbose=None): assert vhfopt.tile_q_cond.shape == (nbas, nbas) n_groups = len(uniq_l_ctr) - tasks = [(i,j) for i in range(n_groups) for j in range(n_groups)] + tasks = [] + for i in range(n_groups): + for j in range(n_groups): + for k in range(n_groups): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): @@ -655,10 +671,10 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *4 for input dm, vj, vk, and vxc - blksize = int(min(avail_mem*.3 / (8*3*nao*nao*4), - avail_mem*.6 / (8*nmo*nocc*natm*3*5))) + blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*4), # in MO + avail_mem*.3 / (8*nao*nao*3*3))) # vj, vk, dm in AO if blksize < ALIGNED**2: - raise RuntimeError('GPU memory insufficient') + raise RuntimeError('GPU memory insufficient for solving CPHF equations') blksize = (blksize // ALIGNED**2) * ALIGNED**2 log.debug(f'GPU memory {avail_mem/GB:.1f} GB available') @@ -704,78 +720,73 @@ def fvind_vo(mo1): log.timer('CPHF solver', *t0) return mo1s, e1s -def gen_vind(mf, mo_coeff, mo_occ): - # Move data to GPU +def gen_vind(hessobj, mo_coeff, mo_occ): + mol = hessobj.mol mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] mocc_2 = mocc * 2 - grids = getattr(mf, 'cphf_grids', None) - if grids is not None: - logger.info(mf, 'Secondary grids defined for CPHF in Hessian') - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) mo1_mo = contract('npo,ip->nio', mo1, mo_coeff) - 
#dm1 = contract('nio,jo->nij', mo1_mo, mocc_2) - #dm1 = dm1 + dm1.transpose(0,2,1) dm1 = mo1_mo.dot(mocc_2.T) - transpose_sum(dm1) + dm1 = transpose_sum(dm1) dm1 = tag_array(dm1, mo1=mo1_mo, occ_coeff=mocc, mo_occ=mo_occ) - v1 = vresp(dm1) - tmp = contract('nij,jo->nio', v1, mocc) - v1vo = contract('nio,ip->npo', tmp, mo_coeff) - return v1vo + return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1) return fx def hess_nuc_elec(mol, dm): ''' calculate hessian contribution due to (nuc, elec) pair ''' + from gpu4pyscf.df import int3c2e + coords = mol.atom_coords() + charges = cupy.asarray(mol.atom_charges(), dtype=np.float64) + + fakemol = gto.fakemol_for_charges(coords) + fakemol.output = mol.output + fakemol.verbose = mol.verbose + fakemol.stdout = mol.stdout + intopt = int3c2e.VHFOpt(mol, fakemol, 'int2e') + intopt.build(1e-14, diag_block_with_triu=True, aosym=False, + group_size=int3c2e.BLKSIZE, group_size_aux=int3c2e.BLKSIZE) + dm = intopt.sort_orbitals(cupy.asarray(dm), axis=[0,1]) - ''' - nao = mol.nao - aoslices = mol.aoslice_by_atom() natm = mol.natm - hcore = numpy.zeros([3,3,natm,natm]) - # CPU version - for ia in range(mol.natm): - ish0, ish1, i0, i1 = aoslices[ia] - zi = mol.atom_charge(ia) - with mol.with_rinv_at_nucleus(ia): - rinv2aa = mol.intor('int1e_ipiprinv', comp=9).reshape([3,3,nao,nao]) - rinv2ab = mol.intor('int1e_iprinvip', comp=9).reshape([3,3,nao,nao]) - rinv2aa *= zi - rinv2ab *= zi - - hcore[:,:,ia,ia] -= numpy.einsum('xypq,pq->xy', rinv2aa+rinv2ab, dm) - - haa = numpy.einsum('xypq,pq->xyp', rinv2aa, dm) - hab = numpy.einsum('xypq,pq->xyp', rinv2ab, dm) - - haa = [haa[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]] - hab = [hab[:,:,p0:p1].sum(axis=2) for p0,p1 in aoslices[:,2:]] - - haa = numpy.stack(haa, axis=2) - hab = numpy.stack(hab, axis=2) - - hcore[:,:,ia] += haa - hcore[:,:,ia] += hab.transpose([1,0,2]) - - hcore[:,:,:,ia] += haa.transpose([1,0,2]) - hcore[:,:,:,ia] += hab + nao = mol.nao + hcore_diag = 
cupy.zeros([9,natm]) + hcore_aa = cupy.zeros([9,natm,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipip1'): + haa = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) + hcore_aa[:,k0:k1,i0:i1] += haa + hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) + + hcore_ab = cupy.zeros([9,natm,nao]) + for i0,i1,j0,j1,k0,k1,int3c_blk in int3c2e.loop_int3c2e_general(intopt, ip_type='ipvip1'): + hab = contract('xpji,ij->xpi', int3c_blk, dm[i0:i1,j0:j1]) + hcore_ab[:,k0:k1,i0:i1] += hab + hcore_diag[:,k0:k1] -= contract('xpji,ij->xp', int3c_blk, dm[i0:i1,j0:j1]) + + hcore_diag = contract('xp,p->xp', hcore_diag, charges) + hcore_aa = contract('xpj,p->xpj', hcore_aa, charges) + hcore_ab = contract('xpj,p->xpj', hcore_ab, charges) - hcore = cupy.asarray(hcore) - ''' - from gpu4pyscf.df import int3c2e - hcore = int3c2e.get_hess_nuc_elec(mol, dm) + aoslices = mol.aoslice_by_atom() + ao2atom = int3c2e.get_ao2atom(intopt, aoslices) + + hcore_aa = contract('xpj,jq->xpq', hcore_aa, ao2atom).reshape([3,3,natm,natm]) + hcore_ab = contract('xpj,jq->xpq', hcore_ab, ao2atom).reshape([3,3,natm,natm]) + hcore = hcore_aa + hcore_aa.transpose([1,0,3,2]) + hcore+= hcore_ab.transpose([1,0,2,3]) + hcore_ab.transpose([0,1,3,2]) + hcore_diag = hcore_diag.reshape([3,3,natm]) + idx = np.arange(natm) + hcore[:,:,idx,idx] += hcore_diag return hcore * 2.0 - def kernel(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): cput0 = (logger.process_clock(), logger.perf_counter()) if mo_energy is None: mo_energy = hessobj.base.mo_energy @@ -832,14 +843,14 @@ def _e_hcore_generator(hessobj, dm): h1aa = cupy.asarray(h1aa) h1ab = cupy.asarray(h1ab) - hcore = cupy.empty((3,3,nao,nao)) t1 = log.timer_debug1('get_hcore', *t1) def get_hcore(iatm, jatm): - nonlocal hcore ish0, ish1, i0, i1 = aoslices[iatm] jsh0, jsh1, j0, j1 = aoslices[jatm] rinv2aa = rinv2ab = None if iatm == jatm: + de = contract('xypq,pq->xy', h1aa[:,:,i0:i1], 
dm[i0:i1]) + de+= contract('xypq,pq->xy', h1ab[:,:,i0:i1,i0:i1], dm[i0:i1,i0:i1]) with mol.with_rinv_at_nucleus(iatm): # The remaining integrals like int1e_ipiprinv are computed in # hess_nuc_elec(mol, dm) @@ -850,18 +861,16 @@ def get_hcore(iatm, jatm): rinv2ab = cupy.asarray(rinv2ab) rinv2aa = rinv2aa.reshape(3,3,nao,nao) rinv2ab = rinv2ab.reshape(3,3,nao,nao) - hcore[:] = 0. - hcore[:,:,i0:i1] += h1aa[:,:,i0:i1] - hcore[:,:,i0:i1,i0:i1] += h1ab[:,:,i0:i1,i0:i1] + if rinv2aa is not None or rinv2ab is not None: - hcore -= rinv2aa + rinv2ab + hcore = -(rinv2aa + rinv2ab) hcore[:,:,i0:i1] += rinv2aa[:,:,i0:i1] hcore[:,:,i0:i1] += rinv2ab[:,:,i0:i1] hcore[:,:,:,i0:i1] += rinv2aa[:,:,i0:i1].transpose(0,1,3,2) hcore[:,:,:,i0:i1] += rinv2ab[:,:,:,i0:i1] + de += cupy.einsum('xypq,pq->xy', hcore, dm) else: - hcore[:] = 0. - hcore[:,:,i0:i1,j0:j1] += h1ab[:,:,i0:i1,j0:j1] + de = contract('xypq,pq->xy',h1ab[:,:,i0:i1,j0:j1],dm[i0:i1,j0:j1]) with mol.with_rinv_at_nucleus(iatm): if with_ecp and iatm in ecp_atoms: shls_slice = (jsh0, jsh1, 0, nbas) @@ -869,8 +878,9 @@ def get_hcore(iatm, jatm): rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) rinv2aa = cupy.asarray(rinv2aa) rinv2ab = cupy.asarray(rinv2ab) - hcore[:,:,j0:j1] += rinv2aa.reshape(3,3,j1-j0,nao) - hcore[:,:,j0:j1] += rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3) + hcore = rinv2aa.reshape(3,3,j1-j0,nao) + hcore+= rinv2ab.reshape(3,3,j1-j0,nao).transpose(1,0,2,3) + de += contract('xypq,pq->xy', hcore, dm[j0:j1]) with mol.with_rinv_at_nucleus(jatm): if with_ecp and jatm in ecp_atoms: shls_slice = (ish0, ish1, 0, nbas) @@ -878,16 +888,39 @@ def get_hcore(iatm, jatm): rinv2ab = -mol.intor('ECPscalar_iprinvip', comp=9, shls_slice=shls_slice) rinv2aa = cupy.asarray(rinv2aa) rinv2ab = cupy.asarray(rinv2ab) - hcore[:,:,i0:i1] += rinv2aa.reshape(3,3,i1-i0,nao) - hcore[:,:,i0:i1] += rinv2ab.reshape(3,3,i1-i0,nao) - de = cupy.einsum('xypq,pq->xy', hcore, dm) - de += cupy.einsum('xyqp,pq->xy', 
hcore, dm) - return cp.asarray(de + de_nuc_elec[:,:,iatm,jatm]) + hcore = rinv2aa.reshape(3,3,i1-i0,nao) + hcore+= rinv2ab.reshape(3,3,i1-i0,nao) + de += contract('xypq,pq->xy', hcore, dm[i0:i1]) + # 2.0* due to the symmetry + return cp.asarray(2.0*de + de_nuc_elec[:,:,iatm,jatm]) return get_hcore def hcore_generator(hessobj, mol=None): raise NotImplementedError +def _get_jk_mo(hessobj, mol, dms, mo_coeff, mo_occ, + hermi=1, with_j=True, with_k=True, omega=None): + ''' Compute J/K matrices in MO for multiple DMs + ''' + mf = hessobj.base + vhfopt = mf._opt_gpu.get(omega) + if vhfopt is None: + with mol.with_range_coulomb(omega): + # Small group size for load balance + group_size = None + if _num_devices > 1: + group_size = GROUP_SIZE + vhfopt = _VHFOpt(mol, mf.direct_scf_tol).build(group_size=group_size) + mf._opt_gpu[omega] = vhfopt + with mol.with_range_coulomb(omega): + vj, vk = jk.get_jk(mol, dms, mo_coeff, mo_occ, hermi, vhfopt, with_j, with_k) + return vj, vk + +def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + hermi=hermi, with_j=True, with_k=True, omega=omega) + return vj - 0.5 * vk + class HessianBase(lib.StreamObject): # attributes max_cycle = rhf_hess_cpu.HessianBase.max_cycle @@ -899,6 +932,8 @@ class HessianBase(lib.StreamObject): make_h1 = rhf_hess_cpu.HessianBase.make_h1 hcore_generator = hcore_generator # the functionality is different from cpu version hess_nuc = rhf_hess_cpu.HessianBase.hess_nuc + gen_vind = NotImplemented + get_jk = NotImplemented kernel = hess = kernel def get_hcore(self, mol=None): @@ -950,6 +985,9 @@ def __init__(self, scf_method): hess_elec = hess_elec make_h1 = make_h1 gen_hop = NotImplemented + gen_vind = gen_vind + get_jk_mo = _get_jk_mo + get_veff_resp_mo = _get_veff_resp_mo # Inject to RHF class from gpu4pyscf import scf diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index 64c6fa4b..d506b934 100644 --- 
a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -25,12 +25,13 @@ from pyscf import lib from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.grad import rhf as rhf_grad -# import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, add_sparse, get_avail_mem, reduce_to_device +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem, + reduce_to_device, transpose_sum) from gpu4pyscf.lib import logger from gpu4pyscf.__config__ import _streams, _num_devices +from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -109,7 +110,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mol = hessobj.mol natm = mol.natm assert atmlst is None or atmlst == range(natm) - nao = mo_coeff.shape[0] mocc = mo_coeff[:,mo_occ>0] dm0 = numpy.dot(mocc, mocc.T) * 2 avail_mem = get_avail_mem() @@ -122,25 +122,29 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem -= 8 * h1mo.size - slice_size = int(avail_mem*0.5) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*3) for atoms_slice in lib.prange(0, natm, slice_size): - vj, vk = rhf_hess._get_jk(mol, dm0, with_k=with_k, - atoms_slice=atoms_slice, verbose=verbose) + vj, vk = rhf_hess._get_jk_ip1(mol, dm0, with_k=with_k, + atoms_slice=atoms_slice, verbose=verbose) veff = vj if with_k: vk *= .5 * hyb veff -= vk + vj = vk = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: with mol.with_range_coulomb(omega): - vk_lr = rhf_hess._get_jk(mol, dm0, with_j=False, 
verbose=verbose)[1] + vk_lr = rhf_hess._get_jk_ip1(mol, dm0, with_j=False, verbose=verbose)[1] vk_lr *= (alpha-hyb) * .5 veff -= vk_lr atom0, atom1 = atoms_slice for i, ia in enumerate(range(atom0, atom1)): for ix in range(3): h1mo[ia,ix] += mo_coeff.T.dot(veff[i,ix].dot(mocc)) - vj = vk = vk_lr = veff = None + vk_lr = veff = None return h1mo XX, XY, XZ = 4, 5, 6 @@ -698,6 +702,166 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): vmat = reduce_to_device(vmat_dist, inplace=True) return vmat +def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, + verbose=None, hermi=1, device_id=0): + with cupy.cuda.Device(device_id), _streams[device_id]: + if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff) + if mo1 is not None: mo1 = cupy.asarray(mo1) + if mocc is not None: mocc = cupy.asarray(mocc) + if fxc is not None: fxc = cupy.asarray(fxc) + + assert isinstance(verbose, int) + log = logger.new_logger(mol, verbose) + xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + + _sorted_mol = opt.mol + nao = mol.nao + nset = mo1.shape[0] + vmat = cupy.zeros((nset, nao, nao)) + + if xctype == 'LDA': + ao_deriv = 0 + else: + ao_deriv = 1 + + ngrids_glob = grids.coords.shape[0] + ngrids_per_device = (ngrids_glob + _num_devices - 1) // _num_devices + grid_start = device_id * ngrids_per_device + grid_end = (device_id + 1) * ngrids_per_device + + p0 = p1 = grid_start + t1 = t0 = log.init_timer() + for ao, mask, weights, coords in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, + max_memory=None, blksize=None, + grid_range=(grid_start, grid_end)): + p0, p1 = p1, p1+len(weights) + occ_coeff_mask = mocc[mask] + rho1 = numint.eval_rho4(_sorted_mol, ao, 2.0*occ_coeff_mask, mo1[:,mask], + xctype=xctype, hermi=hermi) + t1 = log.timer_debug2('eval rho', *t1) + + # precompute fxc_w + if xctype == 'LDA': + fxc_w = fxc[0,0,p0:p1] * weights + wv = rho1 * fxc_w + else: + fxc_w = fxc[:,:,p0:p1] * weights + wv = contract('axg,xyg->ayg', rho1, 
fxc_w) + + for i in range(nset): + if xctype == 'LDA': + vmat_tmp = ao.dot(numint._scale_ao(ao, wv[i]).T) + elif xctype == 'GGA': + wv[i,0] *= .5 + aow = numint._scale_ao(ao, wv[i]) + vmat_tmp = aow.dot(ao[0].T) + elif xctype == 'NLC': + raise NotImplementedError('NLC') + else: + wv[i,0] *= .5 + wv[i,4] *= .5 + vmat_tmp = ao[0].dot(numint._scale_ao(ao[:4], wv[i,:4]).T) + vmat_tmp+= numint._tau_dot(ao, ao, wv[i,4]) + add_sparse(vmat[i], vmat_tmp, mask) + + t1 = log.timer_debug2('integration', *t1) + ao = rho1 = None + t0 = log.timer_debug1(f'vxc on Device {device_id} ', *t0) + if xctype != 'LDA': + transpose_sum(vmat) + vmat = jk._ao2mo(vmat, mocc, mo_coeff) + return vmat + +def nr_rks_fxc_mo(ni, mol, grids, xc_code, dm0=None, dms=None, mo_coeff=None, relativity=0, hermi=0, + rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): + log = logger.new_logger(mol, verbose) + t0 = log.init_timer() + if fxc is None: + raise RuntimeError('fxc was not initialized') + #xctype = ni._xc_type(xc_code) + opt = getattr(ni, 'gdftopt', None) + if opt is None or mol not in [opt.mol, opt._sorted_mol]: + ni.build(mol, grids.coords) + opt = ni.gdftopt + + nao = mol.nao + dms = cupy.asarray(dms) + dm_shape = dms.shape + # AO basis -> gdftopt AO basis + with_mocc = hasattr(dms, 'mo1') + mo1 = mocc = None + if with_mocc: + mo1 = opt.sort_orbitals(dms.mo1, axis=[1]) + mocc = opt.sort_orbitals(dms.occ_coeff, axis=[0]) + mo_coeff = opt.sort_orbitals(mo_coeff, axis=[0]) + dms = opt.sort_orbitals(dms.reshape(-1,nao,nao), axis=[1,2]) + + futures = [] + cupy.cuda.get_current_stream().synchronize() + with ThreadPoolExecutor(max_workers=_num_devices) as executor: + for device_id in range(_num_devices): + future = executor.submit( + _nr_rks_fxc_mo_task, + ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc, + verbose=log.verbose, hermi=hermi, device_id=device_id) + futures.append(future) + dms = None + vmat_dist = [] + for future in futures: + vmat_dist.append(future.result()) + vmat = 
reduce_to_device(vmat_dist, inplace=True) + + if len(dm_shape) == 2: + vmat = vmat[0] + t0 = log.timer_debug1('nr_rks_fxc', *t0) + return cupy.asarray(vmat) + +def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1, omega=None): + mol = hessobj.mol + mf = hessobj.base + grids = getattr(mf, 'cphf_grids', None) + if grids is not None: + logger.info(mf, 'Secondary grids defined for CPHF in Hessian') + else: + # If cphf_grids is not defined, e.g object defined from CPU + grids = getattr(mf, 'grids', None) + logger.info(mf, 'Primary grids is used for CPHF in Hessian') + + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + assert not mf.do_nlc() + hermi = 1 + + mocc = mo_coeff[:,mo_occ>0] + nocc = mocc.shape[1] + nao, nmo = mo_coeff.shape + # TODO: evaluate v1 in MO + rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc, + mo_coeff, mo_occ, 0) + v1 = nr_rks_fxc_mo(ni, mol, grids, mf.xc, None, dms, mo_coeff, 0, hermi, + rho0, vxc, fxc, max_memory=None) + v1 = v1.reshape(-1,nmo*nocc) + + if hybrid: + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) + vk *= hyb + if omega > 1e-10: # For range separated Coulomb + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi, + with_j=False, omega=omega) + vk_lr *= (alpha-hyb) + vk += vk_lr + v1 += vj - .5 * vk + else: + v1 += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1, + with_k=False)[0] + + return v1 + class Hessian(rhf_hess.HessianBase): '''Non-relativistic RKS hessian''' @@ -714,6 +878,9 @@ def __init__(self, mf): partial_hess_elec = partial_hess_elec hess_elec = rhf_hess.hess_elec make_h1 = make_h1 + gen_vind = rhf_hess.gen_vind + get_jk_mo = rhf_hess._get_jk_mo + get_veff_resp_mo = get_veff_resp_mo from gpu4pyscf import dft dft.rks.RKS.Hessian = lib.class_as_method(Hessian) diff --git 
a/gpu4pyscf/hessian/tests/test_rhf_hessian.py b/gpu4pyscf/hessian/tests/test_rhf_hessian.py index a0b07196..ac657199 100644 --- a/gpu4pyscf/hessian/tests/test_rhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_rhf_hessian.py @@ -14,10 +14,14 @@ import unittest import numpy as np -from pyscf import gto, scf, lib +import cupy +import pyscf +from pyscf import gto, lib from pyscf import grad, hessian from pyscf.hessian import rhf as rhf_cpu +from gpu4pyscf import scf from gpu4pyscf.hessian import rhf as rhf_gpu +from gpu4pyscf.hessian import jk def setUpModule(): global mol @@ -46,7 +50,7 @@ def test_hessian_rhf(self): assert abs(ref - e2_gpu).max() < 1e-6 def test_partial_hess_elec(self): - mf = scf.RHF(mol) + mf = pyscf.scf.RHF(mol) mf.conv_tol = 1e-14 mf.kernel() hobj = mf.Hessian() @@ -102,8 +106,7 @@ def test_get_jk(self): nao = mol.nao mo_coeff = np.random.rand(nao, nao) dm = mo_coeff.dot(mo_coeff.T) * 2 - - vj, vk = rhf_gpu._get_jk(mol, dm) + vj, vk = rhf_gpu._get_jk_ip1(mol, dm) assert abs(lib.fp(vj.get()) - 87674.69061160382) < 1e-7 assert abs(lib.fp(vk.get()) - -9.317650662101629) < 1e-7 @@ -139,6 +142,59 @@ def test_hessian_rhf_D3(self): e2_gpu = mf.Hessian().to_gpu().kernel() assert abs(ref - e2_gpu).max() < 1e-6 + def test_jk_mix(self): + mol1 = pyscf.M( + atom=''' + C -1.20806619, -0.34108413, -0.00755148 + C 1.28636081, -0.34128013, -0.00668648 + H 2.53407081, 1.81906387, -0.00736748 + H 1.28693681, 3.97963587, -0.00925948 + ''', + basis='''unc + #BASIS SET: + H S + 1.815041 1 + 0.591063 1 + H P + 2.305000 1 + #BASIS SET: + C S + 8.383976 1 + 3.577015 1 + 1.547118 1 + H P + 2.305000 1 + 1.098827 1 + 0.806750 1 + 0.282362 1 + H D + 1.81900 1 + 0.72760 1 + 0.29104 1 + H F + 0.970109 1 + C G + 0.625000 1 + C H + 0.4 1 + ''', + output = '/dev/null' + ) + nao = mol1.nao + mo_coeff = cupy.random.rand(nao, nao) + mo_occ = cupy.zeros([nao]) + mo_occ[:3] = 2 + mocc = mo_coeff[:,:3] + dm = mocc.dot(mocc.T) * 2 + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, 
hermi=1) + + mf = scf.RHF(mol1) + vj, vk = mf.get_jk(mol1, dm, hermi=1) + vj_cpu = (mo_coeff.T @ vj @ mocc).reshape(1,-1) + vk_cpu = (mo_coeff.T @ vk @ mocc).reshape(1,-1) + assert cupy.linalg.norm(vj_cpu - vj_mo) < 1e-5 + assert cupy.linalg.norm(vk_cpu - vk_mo) < 1e-5 + if __name__ == "__main__": print("Full Tests for RHF Hessian") unittest.main() diff --git a/gpu4pyscf/hessian/tests/test_uhf_hessian.py b/gpu4pyscf/hessian/tests/test_uhf_hessian.py index c4112bec..a7d5c983 100644 --- a/gpu4pyscf/hessian/tests/test_uhf_hessian.py +++ b/gpu4pyscf/hessian/tests/test_uhf_hessian.py @@ -14,10 +14,14 @@ import unittest import numpy -from pyscf import gto, scf, lib +import cupy +import pyscf +from pyscf import gto, lib from pyscf import grad, hessian from pyscf.hessian import uhf as uhf_cpu +from gpu4pyscf import scf from gpu4pyscf.hessian import uhf as uhf_gpu +from gpu4pyscf.hessian import jk def setUpModule(): global mol @@ -48,7 +52,7 @@ def test_hessian_uhf(self): assert abs(ref - e2_gpu).max() < 1e-6 def test_partial_hess_elec(self): - mf = scf.UHF(mol) + mf = pyscf.scf.UHF(mol) mf.conv_tol = 1e-14 mf.kernel() hobj = mf.Hessian() @@ -73,6 +77,68 @@ def test_hessian_uhf_D3(self): e2_gpu = mf.Hessian().to_gpu().kernel() assert abs(ref - e2_gpu).max() < 1e-6 + def test_jk_mix(self): + mol1 = pyscf.M( + atom=''' + C -1.20806619, -0.34108413, -0.00755148 + C 1.28636081, -0.34128013, -0.00668648 + H 2.53407081, 1.81906387, -0.00736748 + H 1.28693681, 3.97963587, -0.00925948 + ''', + basis='''unc + #BASIS SET: + H S + 1.815041 1 + 0.591063 1 + H P + 2.305000 1 + #BASIS SET: + C S + 8.383976 1 + 3.577015 1 + 1.547118 1 + H P + 2.305000 1 + 1.098827 1 + 0.806750 1 + 0.282362 1 + H D + 1.81900 1 + 0.72760 1 + 0.29104 1 + H F + 0.970109 1 + C G + 0.625000 1 + C H + 0.4 1 + ''', + output = '/dev/null' + ) + nao = mol1.nao + mo_coeff = cupy.random.rand(2, nao, nao) + mocca = mo_coeff[0,:,:3] + moccb = mo_coeff[1,:,:2] + mo_occ = cupy.zeros([2,nao]) + mo_occ[0,:3] = 1 + 
mo_occ[1,:2] = 1 + dm = cupy.empty([2,nao,nao]) + dm[0] = mocca.dot(mocca.T) + dm[1] = moccb.dot(moccb.T) + vj_mo, vk_mo = jk.get_jk(mol1, dm, mo_coeff, mo_occ, hermi=1) + + mf = scf.UHF(mol1) + vj, vk = mf.get_jk(mol1, dm, hermi=1) + vj2 = cupy.empty([5*nao]) + vk2 = cupy.empty([5*nao]) + vj = vj[0] + vj[1] + vj2[:3*nao] = (mo_coeff[0].T @ vj @ mocca).reshape(1,-1) + vj2[3*nao:] = (mo_coeff[1].T @ vj @ moccb).reshape(1,-1) + vk2[:3*nao] = (mo_coeff[0].T @ vk[0] @ mocca).reshape(1,-1) + vk2[3*nao:] = (mo_coeff[1].T @ vk[1] @ moccb).reshape(1,-1) + assert cupy.linalg.norm(vj2 - vj_mo) < 1e-5 + assert cupy.linalg.norm(vk2 - vk_mo) < 1e-5 + if __name__ == "__main__": print("Full Tests for UHF Hessian") unittest.main() diff --git a/gpu4pyscf/hessian/uhf.py b/gpu4pyscf/hessian/uhf.py index 9d389bc0..88a6c9fd 100644 --- a/gpu4pyscf/hessian/uhf.py +++ b/gpu4pyscf/hessian/uhf.py @@ -21,19 +21,17 @@ Non-relativistic UHF analytical Hessian ''' -from functools import reduce import numpy as np import cupy import cupy as cp from pyscf import lib from pyscf.scf import ucphf -# import _response_functions to load gen_response methods in SCF class -from gpu4pyscf.scf import _response_functions # noqa -from gpu4pyscf.gto.mole import sort_atoms -from gpu4pyscf.lib.cupy_helper import contract, tag_array, get_avail_mem, krylov +from gpu4pyscf.lib.cupy_helper import (contract, transpose_sum, get_avail_mem, + krylov, tag_array) from gpu4pyscf.lib import logger from gpu4pyscf.grad import rhf as rhf_grad from gpu4pyscf.hessian import rhf as rhf_hess_gpu +from gpu4pyscf.hessian import jk GB = 1024*1024*1024 ALIGNED = 4 @@ -67,8 +65,9 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, h1mo = (h1mo[0].get(), h1mo[1].get()) t1 = log.timer_debug1('making H1', *t1) if mo1 is None or mo_e1 is None: + fx = hessobj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessobj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1mo, - None, atmlst, max_memory, log) + fx, atmlst, max_memory, log) t1 = 
log.timer_debug1('solving MO1', *t1) mo1a = cupy.asarray(mo1[0]) @@ -181,18 +180,20 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mo_a, mo_b = mo_coeff mocca = mo_a[:,mo_occ[0]>0] moccb = mo_b[:,mo_occ[1]>0] - nao = mo_a.shape[0] dm0a = mocca.dot(mocca.T) dm0b = moccb.dot(moccb.T) grad_obj = hessobj.base.Gradients() h1moa = rhf_grad.get_grad_hcore(grad_obj, mo_a, mo_occ[0]) h1mob = rhf_grad.get_grad_hcore(grad_obj, mo_b, mo_occ[1]) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem = get_avail_mem() - slice_size = int(avail_mem*0.6) // (8*3*nao*nao*2) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6) for atoms_slice in lib.prange(0, natm, slice_size): - vja, vka = rhf_hess_gpu._get_jk(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose) - vjb, vkb = rhf_hess_gpu._get_jk(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose) + vja, vka = rhf_hess_gpu._get_jk_ip1(mol, dm0a, atoms_slice=atoms_slice, verbose=verbose) + vjb, vkb = rhf_hess_gpu._get_jk_ip1(mol, dm0b, atoms_slice=atoms_slice, verbose=verbose) #:vhfa = vja+vjb - vka #:vhfb = vja+vjb - vkb vhfa = vka @@ -291,8 +292,8 @@ def fvind_vo(mo1): avail_mem = get_avail_mem() # *8 for spin-up/down input dm, vj, vk, and vxc - blksize = int(min(avail_mem*.3 / (8*3*nao*nao*8), - avail_mem*.6 / (8*nmo*nocc*natm*3*5))) + blksize = int(min(avail_mem*.3 / (8*3*nao*nocc*8), + avail_mem*.3 / (8*nao*nao*3*6))) # in vj, vk, dm in AO if blksize < ALIGNED**2: raise RuntimeError('GPU memory insufficient') @@ -368,8 +369,9 @@ def fvind_vo(mo1): log.timer('CPHF solver', *t0) return (mo1sa, mo1sb), (e1sa, e1sb) -def gen_vind(mf, mo_coeff, mo_occ): +def gen_vind(hessobj, mo_coeff, mo_occ): # Move data to GPU + mol = hessobj.mol mo_coeff = cupy.asarray(mo_coeff) mo_occ = cupy.asarray(mo_occ) nao, nmoa = mo_coeff[0].shape @@ -378,39 +380,32 @@ def gen_vind(mf, mo_coeff, mo_occ): moccb = 
mo_coeff[1][:,mo_occ[1]>0] nocca = mocca.shape[1] noccb = moccb.shape[1] - grids = getattr(mf, 'cphf_grids', None) - if grids is not None: - logger.info(mf, 'Secondary grids defined for CPHF in Hessian') - vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1, grids=grids) def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmoa*nocca+nmob*noccb) nset = len(mo1) + dm1 = cupy.empty([2,nset,nao,nao]) + x = mo1[:,:nmoa*nocca].reshape(nset,nmoa,nocca) mo1_moa = contract('npo,ip->nio', x, mo_coeff[0]) dma = contract('nio,jo->nij', mo1_moa, mocca) + dm1[0] = transpose_sum(dma) x = mo1[:,nmoa*nocca:].reshape(nset,nmob,noccb) mo1_mob = contract('npo,ip->nio', x, mo_coeff[1]) dmb = contract('nio,jo->nij', mo1_mob, moccb) - - dm1 = cupy.empty([2,nset,nao,nao]) - dm1[0] = dma + dma.transpose(0,2,1) - dm1[1] = dmb + dmb.transpose(0,2,1) + dm1[1] = transpose_sum(dmb) dm1 = tag_array(dm1, mo1=[mo1_moa,mo1_mob], occ_coeff=[mocca,moccb], mo_occ=mo_occ) - v1 = vresp(dm1) - v1vo = cupy.empty_like(mo1) - tmp = contract('nij,jo->nio', v1[0], mocca) - v1vo[:,:nmoa*nocca] = contract('nio,ip->npo', tmp, mo_coeff[0]).reshape(nset,-1) - - tmp = contract('nij,jo->nio', v1[1], moccb) - v1vo[:,nmoa*nocca:] = contract('nio,ip->npo', tmp, mo_coeff[1]).reshape(nset,-1) - return v1vo + return hessobj.get_veff_resp_mo(mol, dm1, mo_coeff, mo_occ, hermi=1) return fx +def _get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + hermi=hermi, with_j=True, with_k=True) + return vj - vk class Hessian(rhf_hess_gpu.HessianBase): '''Non-relativistic unrestricted Hartree-Fock hessian''' @@ -421,6 +416,9 @@ class Hessian(rhf_hess_gpu.HessianBase): partial_hess_elec = partial_hess_elec hess_elec = hess_elec make_h1 = make_h1 + gen_vind = gen_vind + get_jk_mo = rhf_hess_gpu._get_jk_mo + get_veff_resp_mo = _get_veff_resp_mo def solve_mo1(self, mo_energy, mo_coeff, mo_occ, h1mo, fx=None, atmlst=None, max_memory=4000, verbose=None): diff --git 
a/gpu4pyscf/hessian/uks.py b/gpu4pyscf/hessian/uks.py index 19216a55..2a048f5f 100644 --- a/gpu4pyscf/hessian/uks.py +++ b/gpu4pyscf/hessian/uks.py @@ -23,11 +23,11 @@ from gpu4pyscf.hessian import rhf as rhf_hess from gpu4pyscf.hessian import uhf as uhf_hess from gpu4pyscf.grad import rhf as rhf_grad -# import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.dft import numint -from gpu4pyscf.lib.cupy_helper import contract, add_sparse, take_last2d, get_avail_mem +from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem) from gpu4pyscf.lib import logger +from gpu4pyscf.hessian import jk def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -114,7 +114,6 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): mo_a, mo_b = mo_coeff mocca = mo_a[:,mo_occ[0]>0] moccb = mo_b[:,mo_occ[1]>0] - nao = mo_a.shape[0] dm0a = mocca.dot(mocca.T) dm0b = moccb.dot(moccb.T) avail_mem = get_avail_mem() @@ -129,11 +128,14 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) with_k = ni.libxc.is_hybrid_xc(mf.xc) + # Estimate the size of intermediate variables + # dm, vj, and vk in [natm,3,nao_cart,nao_cart] + nao_cart = mol.nao_cart() avail_mem -= 8 * (h1moa.size + h1mob.size) - slice_size = int(avail_mem*0.5) // (8*3*nao*nao) + slice_size = int(avail_mem*0.5) // (8*3*nao_cart*nao_cart*6) for atoms_slice in lib.prange(0, natm, slice_size): - vja, vka = rhf_hess._get_jk(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) - vjb, vkb = rhf_hess._get_jk(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) + vja, vka = rhf_hess._get_jk_ip1(mol, dm0a, with_k=with_k, atoms_slice=atoms_slice, verbose=verbose) + vjb, vkb = rhf_hess._get_jk_ip1(mol, dm0b, with_k=with_k, atoms_slice=atoms_slice, 
verbose=verbose) vj = vja + vjb if with_k: #:veffa = vja + vjb - hyb * vka @@ -150,8 +152,8 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): vj = vja = vjb = vka = vkb = None if abs(omega) > 1e-10 and abs(alpha-hyb) > 1e-10: with mol.with_range_coulomb(omega): - vka_lr = rhf_hess._get_jk(mol, dm0a, with_j=False, verbose=verbose)[1] - vkb_lr = rhf_hess._get_jk(mol, dm0b, with_j=False, verbose=verbose)[1] + vka_lr = rhf_hess._get_jk_ip1(mol, dm0a, with_j=False, verbose=verbose)[1] + vkb_lr = rhf_hess._get_jk_ip1(mol, dm0b, with_j=False, verbose=verbose)[1] vka_lr *= (alpha-hyb) vkb_lr *= (alpha-hyb) veffa -= vka_lr @@ -842,6 +844,55 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): vmatb[ia] -= vmat_tmp return vmata, vmatb +def get_veff_resp_mo(hessobj, mol, dms, mo_coeff, mo_occ, hermi=1): + mol = hessobj.mol + mf = hessobj.base + grids = getattr(mf, 'cphf_grids', None) + if grids is not None: + logger.info(mf, 'Secondary grids defined for CPHF in Hessian') + else: + # If cphf_grids is not defined, e.g object defined from CPU + grids = getattr(mf, 'grids', None) + logger.info(mf, 'Primary grids is used for CPHF in Hessian') + + if grids and grids.coords is None: + grids.build(mol=mol, with_non0tab=False, sort_grids=True) + + nao, nmoa = mo_coeff[0].shape + nao, nmob = mo_coeff[1].shape + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + nocca = mocca.shape[1] + noccb = moccb.shape[1] + + ni = mf._numint + omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin) + hybrid = ni.libxc.is_hybrid_xc(mf.xc) + assert not mf.do_nlc() + hermi = 1 + + rho0, vxc, fxc = ni.cache_xc_kernel(mol, grids, mf.xc, + mo_coeff, mo_occ, 1) + v1 = ni.nr_uks_fxc(mol, grids, mf.xc, None, dms, 0, hermi, + rho0, vxc, fxc, max_memory=None) + nset = dms.shape[1] + v1vo = cupy.empty([nset, nmoa*nocca+nmob*noccb]) + v1vo[:,:nmoa*nocca] = jk._ao2mo(v1[0], mocca, mo_coeff[0]).reshape(-1,nmoa*nocca) + v1vo[:,nmoa*nocca:] = 
jk._ao2mo(v1[1], moccb, mo_coeff[1]).reshape(-1,nmob*noccb) + if hybrid: + vj, vk = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, hermi=1) + vk *= hyb + if omega > 1e-10: + _, vk_lr = hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + hermi, with_j=False, omega=omega) + vk_lr *= (alpha-hyb) + vk += vk_lr + v1vo += vj - vk + else: + v1vo += hessobj.get_jk_mo(mol, dms, mo_coeff, mo_occ, + hermi=1, with_k=False)[0] + return v1vo + class Hessian(rhf_hess.HessianBase): '''Non-relativistic UKS hessian''' @@ -856,6 +907,9 @@ def __init__(self, mf): solve_mo1 = uhf_hess.Hessian.solve_mo1 partial_hess_elec = partial_hess_elec make_h1 = make_h1 + gen_vind = uhf_hess.gen_vind + get_jk_mo = rhf_hess._get_jk_mo + get_veff_resp_mo = get_veff_resp_mo from gpu4pyscf import dft dft.uks.UKS.Hessian = lib.class_as_method(Hessian) diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index 5828bbfe..d68cbcff 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -23,6 +23,7 @@ from gpu4pyscf.gto import mole from gpu4pyscf.lib.cutensor import contract from gpu4pyscf.lib.cusolver import eigh, cholesky #NOQA +from gpu4pyscf.lib.memcpy import copy_array #NOQA from gpu4pyscf.__config__ import _streams, _num_devices, _p2p_access LMAX_ON_GPU = 7 @@ -87,15 +88,15 @@ def p2p_transfer(a, b): a[:] = b elif _p2p_access: a[:] = b + ''' elif a.strides == b.strides and a.flags.c_contiguous and a.dtype == b.dtype: # cupy supports a direct copy from different devices without p2p. 
See also # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L48 # https://github.com/cupy/cupy/blob/v13.3.0/cupy/_core/_routines_indexing.pyx#L1015 a[:] = b + ''' else: - with cupy.cuda.Device(a.device): - # TODO: reduce memory copy, a can be non-contiguous array - a[:] = cupy.asarray(b.get()) + copy_array(b, a) def concatenate(array_list): ''' Concatenate axis=0 only @@ -103,15 +104,16 @@ def concatenate(array_list): if _p2p_access: return cupy.concatenate(array_list) else: - array_list_cpu = [a.get() for a in array_list] - n = sum([a.shape[0] for a in array_list_cpu]) - a0_shape = list(array_list_cpu[0].shape) + #array_list_cpu = [a.get() for a in array_list] + n = sum([a.shape[0] for a in array_list]) + a0_shape = list(array_list[0].shape) out_shape = tuple([n] + a0_shape[1:]) out = cupy.empty(out_shape) p0 = p1 = 0 - for a in array_list_cpu: + for a in array_list: p1 = p0 + a.shape[0] - out[p0:p1].set(a) + #out[p0:p1].set(a) + copy_array(a, out[p0:p1]) p0 = p1 return out @@ -136,18 +138,19 @@ def reduce_to_device(array_list, inplace=False): result = array_list[0] else: result = array_list[0].copy() + + # Transfer data chunk by chunk, reduce memory footprint, result = result.reshape(-1) - # Asynchronously add each matrix from its device for device_id, matrix in enumerate(array_list): if device_id == 0: continue assert matrix.device.id == device_id matrix = matrix.reshape(-1) - blksize = 1024*1024*128 # 1GB + blksize = 1024*1024*1024 // matrix.itemsize # 1GB for p0, p1 in lib.prange(0,len(matrix), blksize): - result[p0:p1] += cupy.asarray(matrix[p0:p1]) - + result[p0:p1] += copy_array(matrix[p0:p1]) + #result[p0:p1] += cupy.asarray(matrix[p0:p1]) return result.reshape(out_shape) def device2host_2d(a_cpu, a_gpu, stream=None): diff --git a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu index d04b1b2f..1cf53f89 100644 --- a/gpu4pyscf/lib/gint/g1e_ip_root_1.cu +++ b/gpu4pyscf/lib/gint/g1e_ip_root_1.cu @@ -210,6 
+210,100 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel00(double* output, cons atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij), deri_dAz_grid_sum); } +__global__ +static void GINTfill_int3c1e_ip1_density_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 3; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + const double PAx = Px - Ax; + const double PAy = Py - Ay; + const double PAz = Pz - Az; + const double minus_two_a = -2.0 * a_exponents[ij]; + const double one_over_two_p = 0.5 / aij; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + deri_dAx += prefactor * minus_two_a * (PAx * R000_0 + one_over_two_p * R000_1 * PCx); + deri_dAy += prefactor * minus_two_a * (PAy * R000_0 + one_over_two_p * R000_1 * PCy); + deri_dAz += prefactor * minus_two_a * (PAz * R000_0 + one_over_two_p * R000_1 * PCz); + } + } + + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const double Dij = density[i0 + j0 * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + + const int i_atom = aoslice[ish]; + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (task_grid + ngrids 
* (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (task_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); +} + __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int nprim_ij, @@ -282,3 +376,85 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel00(double* output, con atomicAdd(output + task_grid + 1 * ngrids, deri_dCy_pair_sum); atomicAdd(output + task_grid + 2 * ngrids, deri_dCz_pair_sum); } + +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel00(double* output, const BasisProdOffsets offsets, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + + const double* __restrict__ a12 = c_bpcache.a12; + const double* __restrict__ e12 = c_bpcache.e12; + const double* __restrict__ x12 = c_bpcache.x12; + const double* __restrict__ y12 = c_bpcache.y12; + const double* __restrict__ z12 = c_bpcache.z12; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; + for (int ij = prim_ij; ij < prim_ij + nprim_ij; ij++) { + const double aij = a12[ij]; + const double eij = e12[ij]; + const double Px = x12[ij]; + const double Py = y12[ij]; + const double Pz = z12[ij]; + const double PCx = Px - Cx; + const double PCy = Py - Cy; + const double PCz = Pz - Cz; + double a0 = aij; + const double q_over_p_plus_q = charge_exponent > 0.0 ? charge_exponent / (aij + charge_exponent) : 1.0; + const double sqrt_q_over_p_plus_q = charge_exponent > 0.0 ? sqrt(q_over_p_plus_q) : 1.0; + a0 *= q_over_p_plus_q; + const double theta = omega > 0.0 ? omega * omega / (omega * omega + a0) : 1.0; + const double sqrt_theta = omega > 0.0 ? sqrt(theta) : 1.0; + a0 *= theta; + + const double prefactor = 2.0 * M_PI / aij * eij * sqrt_theta * sqrt_q_over_p_plus_q; + const double boys_input = a0 * (PCx * PCx + PCy * PCy + PCz * PCz); + if (boys_input > 3.e-7) { + const double sqrt_boys_input = sqrt(boys_input); + const double R000_0 = SQRTPIE4 / sqrt_boys_input * erf(sqrt_boys_input); + const double R000_1 = -a0 * (R000_0 - exp(-boys_input)) / boys_input; + const double R100_0 = R000_1 * PCx; + const double R010_0 = R000_1 * PCy; + const double R001_0 = R000_1 * PCz; + deri_dCx += prefactor * R100_0; + deri_dCy += prefactor * R010_0; + deri_dCz += prefactor * R001_0; + } + } + + const double charge = grid_point[3]; + deri_dCx *= charge; + deri_dCy *= charge; + deri_dCz *= charge; + + const int i_atom = gridslice[task_grid]; + const int* ao_loc = c_bpcache.ao_loc; + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + atomicAdd(output + (i0 + j0 * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + (i0 + j0 * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + (i0 + j0 * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); +} diff --git 
a/gpu4pyscf/lib/gint/g3c1e_ip.cu b/gpu4pyscf/lib/gint/g3c1e_ip.cu index b7524ca2..9a806bef 100644 --- a/gpu4pyscf/lib/gint/g3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/g3c1e_ip.cu @@ -366,6 +366,104 @@ static void GINTfill_int3c1e_ip1_charge_contracted_kernel_general(double* output } } +template +__device__ +static void GINTwrite_int3c1e_ip1_density_contracted(const double* g, double* output, const double minus_two_a, const double* density, const int* aoslice, const int nao, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish]; + const int j0 = ao_loc[jsh]; + + const int i_atom = aoslice[ish]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dAx = 0; + double deri_dAy = 0; + double deri_dAz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double dgx_dAx = (ix > 0 ? 
ix * gx[(gx_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gx[(gx_offset + 1) * NROOTS + i_root]; + const double dgy_dAy = (iy > 0 ? iy * gy[(gy_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gy[(gy_offset + 1) * NROOTS + i_root]; + const double dgz_dAz = (iz > 0 ? iz * gz[(gz_offset - 1) * NROOTS + i_root] : 0) + minus_two_a * gz[(gz_offset + 1) * NROOTS + i_root]; + deri_dAx += dgx_dAx * gy_0 * gz_0; + deri_dAy += gx_0 * dgy_dAy * gz_0; + deri_dAz += gx_0 * gy_0 * dgz_dAz; + } + const double Dij = density[(i + i0) + (j + j0) * nao]; + deri_dAx *= Dij; + deri_dAy *= Dij; + deri_dAz *= Dij; + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 0)), deri_dAx); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 1)), deri_dAy); + atomicAdd(output + (i_grid + ngrids * (i_atom * 3 + 2)), deri_dAz); + } + } +} + +template +__global__ +static void GINTfill_int3c1e_ip1_density_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + + const double* grid_point = grid_points + task_grid * 3; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + double g[GSIZE_INT3C_1E]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e(g, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip1_density_contracted(g, output, minus_two_a, density, aoslice, nao, ish, jsh, task_grid, i_l, j_l, ngrids); + } +} + template __global__ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, @@ -464,3 +562,120 @@ static void GINTfill_int3c1e_ip2_density_contracted_kernel_general(double* outpu atomicAdd(output + task_grid + ngrids * 1, deri_dCy_pair_sum); atomicAdd(output + task_grid + ngrids * 2, deri_dCz_pair_sum); } + +template +__device__ +static void GINTwrite_int3c1e_ip2_charge_contracted(const double* g, double* output, const double minus_two_a, const double* u2, const double* AC, const double prefactor, + const int ish, const int jsh, const int i_grid, const int i_l, const int j_l, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, const int ngrids) +{ + const int* ao_loc = c_bpcache.ao_loc; + + const int i0 = ao_loc[ish] - ao_offsets_i; + const int j0 = ao_loc[jsh] - ao_offsets_j; + + const int i_atom = gridslice[i_grid]; + + const int *idx = c_idx; + const int *idy = c_idx + TOT_NF; + const int *idz = c_idx + TOT_NF * 2; + + const int g_size = NROOTS * (i_l + 1 + 1) * (j_l + 1); + const double* __restrict__ gx = g; + const double* __restrict__ gy = g + g_size; + const double* __restrict__ gz = g + g_size * 2; + + const int n_density_elements_i = (i_l + 1) * (i_l + 2) / 2; + const int n_density_elements_j = (j_l + 1) * (j_l + 2) / 2; + for (int j = 0; j < n_density_elements_j; j++) { + for (int i = 0; i < n_density_elements_i; i++) { + const int loc_j = c_l_locs[j_l] + j; + const int loc_i = c_l_locs[i_l] + i; + const int 
ix = idx[loc_i]; + const int iy = idy[loc_i]; + const int iz = idz[loc_i]; + const int jx = idx[loc_j]; + const int jy = idy[loc_j]; + const int jz = idz[loc_j]; + const int gx_offset = ix + jx * (i_l + 1 + 1); + const int gy_offset = iy + jy * (i_l + 1 + 1); + const int gz_offset = iz + jz * (i_l + 1 + 1); + + double deri_dCx = 0; + double deri_dCy = 0; + double deri_dCz = 0; +#pragma unroll + for (int i_root = 0; i_root < NROOTS; i_root++) { + const double gx_0 = gx[gx_offset * NROOTS + i_root]; + const double gy_0 = gy[gy_offset * NROOTS + i_root]; + const double gz_0 = gz[gz_offset * NROOTS + i_root]; + const double gx_1 = gx[(gx_offset + 1) * NROOTS + i_root]; + const double gy_1 = gy[(gy_offset + 1) * NROOTS + i_root]; + const double gz_1 = gz[(gz_offset + 1) * NROOTS + i_root]; + const double minus_two_u2 = -2.0 * u2[i_root]; + const double dgx_dCx = minus_two_u2 * (gx_1 + AC[0] * gx_0); + const double dgy_dCy = minus_two_u2 * (gy_1 + AC[1] * gy_0); + const double dgz_dCz = minus_two_u2 * (gz_1 + AC[2] * gz_0); + deri_dCx += dgx_dCx * gy_0 * gz_0; + deri_dCy += gx_0 * dgy_dCy * gz_0; + deri_dCz += gx_0 * gy_0 * dgz_dCz; + } + deri_dCx *= prefactor; + deri_dCy *= prefactor; + deri_dCz *= prefactor; + + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 0 * stride_ij + i_atom * 3 * stride_ij), deri_dCx); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 1 * stride_ij + i_atom * 3 * stride_ij), deri_dCy); + atomicAdd(output + ((i + i0) + (j + j0) * stride_j + 2 * stride_ij + i_atom * 3 * stride_ij), deri_dCz); + } + } +} + +template +__global__ +static void GINTfill_int3c1e_ip2_charge_contracted_kernel_general(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int 
ngrids = offsets.ntasks_kl; + const int task_ij = blockIdx.x * blockDim.x + threadIdx.x; + const int task_grid = blockIdx.y * blockDim.y + threadIdx.y; + + if (task_ij >= ntasks_ij || task_grid >= ngrids) { + return; + } + + const int bas_ij = offsets.bas_ij + task_ij; + const int prim_ij = offsets.primitive_ij + task_ij * nprim_ij; + const int* bas_pair2bra = c_bpcache.bas_pair2bra; + const int* bas_pair2ket = c_bpcache.bas_pair2ket; + const int ish = bas_pair2bra[bas_ij]; + const int jsh = bas_pair2ket[bas_ij]; + const double* __restrict__ a_exponents = c_bpcache.a1; + const int nbas = c_bpcache.nbas; + const double* __restrict__ bas_x = c_bpcache.bas_coords; + const double* __restrict__ bas_y = bas_x + nbas; + const double* __restrict__ bas_z = bas_y + nbas; + const double Ax = bas_x[ish]; + const double Ay = bas_y[ish]; + const double Az = bas_z[ish]; + + const double* grid_point = grid_points + task_grid * 4; + const double Cx = grid_point[0]; + const double Cy = grid_point[1]; + const double Cz = grid_point[2]; + const double charge = grid_point[3]; + const double charge_exponent = (charge_exponents != NULL) ? 
charge_exponents[task_grid] : 0.0; + + const double AC[3] { Ax - Cx, Ay - Cy, Az - Cz }; + + double g[GSIZE_INT3C_1E]; + double u2[NROOTS]; + + for (int ij = prim_ij; ij < prim_ij+nprim_ij; ++ij) { + GINT_g1e_save_u2(g, u2, grid_point, ish, jsh, ij, i_l + 1, j_l, charge_exponent, omega); + const double minus_two_a = -2.0 * a_exponents[ij]; + GINTwrite_int3c1e_ip2_charge_contracted(g, output, minus_two_a, u2, AC, charge, ish, jsh, task_grid, i_l, j_l, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, ngrids); + } +} diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu index e0ace197..3ee7c423 100644 --- a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu +++ b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ip.cu @@ -113,6 +113,55 @@ static int GINTfill_int3c1e_ip1_charge_contracted_tasks(double* output, const Ba return 0; } +static int GINTfill_int3c1e_ip1_density_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const double* density, const int* aoslice, const int nao, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip1_density_contracted_kernel00<<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // 
case 03: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, density, shell, nao, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + // case 40: 
GINTfill_int3c1e_ip1_density_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip1_density_contracted_kernel_general<1, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip1_density_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip1_density_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip1_density_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip1_density_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, density, aoslice, nao, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const double* density, const HermiteDensityOffsets hermite_density_offsets, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, const double omega, const double* grid_points, const double* charge_exponents, @@ -147,12 +196,62 @@ static int GINTfill_int3c1e_ip2_density_contracted_tasks(double* output, const d return 0; } +static int 
GINTfill_int3c1e_ip2_charge_contracted_tasks(double* output, const BasisProdOffsets offsets, const int i_l, const int j_l, const int nprim_ij, + const int stride_j, const int stride_ij, const int ao_offsets_i, const int ao_offsets_j, + const int* gridslice, + const double omega, const double* grid_points, const double* charge_exponents, + const cudaStream_t stream) +{ + const int ntasks_ij = offsets.ntasks_ij; + const int ngrids = offsets.ntasks_kl; + + const dim3 threads(THREADSX, THREADSY); + const dim3 blocks((ntasks_ij+THREADSX-1)/THREADSX, (ngrids+THREADSY-1)/THREADSY); + const int type_ij = i_l * 10 + j_l; + switch (type_ij) { + case 00: GINTfill_int3c1e_ip2_charge_contracted_kernel00<<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 01: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 02: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 03: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 04: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<0, 4> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 10: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 11: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 1> <<>>(output, offsets, 
nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 12: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 13: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<1, 3> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 20: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 21: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 22: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<2, 2> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 30: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 31: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<3, 1> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + // case 40: GINTfill_int3c1e_ip2_charge_contracted_kernel_expanded<4, 0> <<>>(output, offsets, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + switch (nrys_roots) { + case 1: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<1, GSIZE5_INT3C_1E> 
<<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 2: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<2, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 3: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<3, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 4: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<4, GSIZE4_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + case 5: GINTfill_int3c1e_ip2_charge_contracted_kernel_general<5, GSIZE5_INT3C_1E> <<>>(output, offsets, i_l, j_l, nprim_ij, stride_j, stride_ij, ao_offsets_i, ao_offsets_j, gridslice, omega, grid_points, charge_exponents); break; + default: + fprintf(stderr, "type_ij = %d, nrys_roots = %d out of range\n", type_ij, nrys_roots); + return 1; + } + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in %s: %s\n", __func__, cudaGetErrorString(err)); + return 1; + } + return 0; +} + extern "C" { int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integrals, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -198,11 +297,63 @@ int GINTfill_int3c1e_ip(const cudaStream_t stream, const BasisProdCache* bpcache return 0; } +int GINTfill_int3c1e_ip1_density_contracted(const cudaStream_t stream, const 
BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const double* density, const int* aoslice, const int nao, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip1_density_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + density, aoslice, nao, + omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} + + int GINTfill_int3c1e_ip1_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, const double* grid_points, const double* charge_exponents, const int ngrids, double* integral_charge_contracted, const int* strides, const int* ao_offsets, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int 
n_charge_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -252,7 +403,7 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas const double* grid_points, const double* charge_exponents, const int ngrids, const double* dm_pair_ordered, const int* density_offset, double* integral_density_contracted, - const int* bins_locs_ij, int nbins, + const int* bins_locs_ij, const int nbins, const int cp_ij_id, const double omega, const int n_pair_sum_per_thread) { const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; @@ -302,4 +453,56 @@ int GINTfill_int3c1e_ip2_density_contracted(const cudaStream_t stream, const Bas return 0; } + +int GINTfill_int3c1e_ip2_charge_contracted(const cudaStream_t stream, const BasisProdCache* bpcache, + const double* grid_points, const double* charge_exponents, const int ngrids, + double* integral_charge_contracted, + const int* strides, const int* ao_offsets, + const int* bins_locs_ij, const int nbins, + const int cp_ij_id, + const int* gridslice, + const double omega) +{ + const ContractionProdType *cp_ij = bpcache->cptype + cp_ij_id; + const int i_l = cp_ij->l_bra; + const int j_l = cp_ij->l_ket; + const int nrys_roots = (i_l + j_l + 1) / 2 + 1; + const int nprim_ij = cp_ij->nprim_12; + + if (nrys_roots > MAX_NROOTS_INT3C_1E + 1) { + fprintf(stderr, "nrys_roots = %d too high\n", nrys_roots); + return 2; + } + + checkCudaErrors(cudaMemcpyToSymbol(c_bpcache, bpcache, sizeof(BasisProdCache))); + + const int* bas_pairs_locs = bpcache->bas_pairs_locs; + const int* primitive_pairs_locs = bpcache->primitive_pairs_locs; + for (int ij_bin = 0; ij_bin < nbins; ij_bin++) { + const int bas_ij0 = bins_locs_ij[ij_bin]; + const int bas_ij1 = bins_locs_ij[ij_bin + 1]; + const int ntasks_ij = bas_ij1 - bas_ij0; + if (ntasks_ij <= 0) { + continue; + } + + BasisProdOffsets offsets; + offsets.ntasks_ij = ntasks_ij; + offsets.ntasks_kl = ngrids; + offsets.bas_ij = bas_pairs_locs[cp_ij_id] + 
bas_ij0; + offsets.bas_kl = -1; + offsets.primitive_ij = primitive_pairs_locs[cp_ij_id] + bas_ij0 * nprim_ij; + offsets.primitive_kl = -1; + + const int err = GINTfill_int3c1e_ip2_charge_contracted_tasks(integral_charge_contracted, offsets, i_l, j_l, nprim_ij, + strides[0], strides[1], ao_offsets[0], ao_offsets[1], + gridslice, omega, grid_points, charge_exponents, stream); + + if (err != 0) { + return err; + } + } + + return 0; +} } diff --git a/gpu4pyscf/lib/memcpy.py b/gpu4pyscf/lib/memcpy.py new file mode 100644 index 00000000..c961a9a2 --- /dev/null +++ b/gpu4pyscf/lib/memcpy.py @@ -0,0 +1,95 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import cupy +import numpy as np + +def find_contiguous_chunks(shape, h_strides, d_strides): + """ + Find the largest contiguous chunk size based on strides and shape. 
+ """ + chunk_shape = [] + chunk_size = 1 + for dim, h_stride, d_stride in zip(reversed(shape), reversed(h_strides), reversed(d_strides)): + if h_stride == chunk_size and d_stride == chunk_size: + chunk_shape.append(dim) + chunk_size *= dim + else: + break + chunk_shape = tuple(reversed(chunk_shape)) + return chunk_shape, chunk_size + +def copy_array(src_view, out=None): + ''' Copy cupy/numpy array to cupy array if out is None + Copy cupy/numpy array to cupy/numpy array (out) + ''' + if out is None: + out = cupy.empty_like(src_view) + else: + # Ensure both arrays have the same shape + if src_view.shape != out.shape: + raise ValueError("Host and device views must have the same shape.") + return _copy_array(src_view, out) + +def _copy_array(src_view, dst_view): + ''' Copy data from cupy/numpy array to another cupy/numpy array + Check memory layout, then copy memory chunks by cupy.cuda.runtime.memcpy + ''' + if src_view.nbytes == 0: + return dst_view + + shape = src_view.shape + itemsize = src_view.itemsize + strides_src = [stride // itemsize for stride in src_view.strides] + strides_dst = [stride // itemsize for stride in dst_view.strides] + + # Find the largest contiguous chunk + chunk_shape, chunk_size = find_contiguous_chunks(shape, strides_src, strides_dst) + + if isinstance(src_view, cupy.ndarray): + src_data_ptr = src_view.data.ptr + else: + src_data_ptr = src_view.ctypes.data + + if isinstance(dst_view, cupy.ndarray): + dst_data_ptr = dst_view.data.ptr + else: + dst_data_ptr = dst_view.ctypes.data + + if isinstance(src_view, cupy.ndarray) and isinstance(dst_view, cupy.ndarray): + kind = cupy.cuda.runtime.memcpyDeviceToDevice + elif isinstance(src_view, cupy.ndarray) and isinstance(dst_view, np.ndarray): + kind = cupy.cuda.runtime.memcpyDeviceToHost + elif isinstance(src_view, np.ndarray) and isinstance(dst_view, cupy.ndarray): + kind = cupy.cuda.runtime.memcpyHostToDevice + else: + raise NotImplementedError + + assert len(chunk_shape) > 0 + + # Transfer data 
chunk-by-chunk + outer_dims = shape[:-len(chunk_shape)] + for outer_index in np.ndindex(*outer_dims): + # Compute offsets for the current outer slice + src_offset = sum(outer_index[i] * strides_src[i] for i in range(len(outer_dims))) + dst_offset = sum(outer_index[i] * strides_dst[i] for i in range(len(outer_dims))) + # Perform the memcpy for the contiguous chunk + cupy.cuda.runtime.memcpy( + dst_data_ptr + dst_offset * dst_view.itemsize, + src_data_ptr + src_offset * src_view.itemsize, + chunk_size * src_view.itemsize, + kind + ) + return dst_view diff --git a/gpu4pyscf/lib/tests/test_cupy_helper.py b/gpu4pyscf/lib/tests/test_cupy_helper.py index 0f406c82..b322f8ed 100644 --- a/gpu4pyscf/lib/tests/test_cupy_helper.py +++ b/gpu4pyscf/lib/tests/test_cupy_helper.py @@ -19,7 +19,8 @@ from gpu4pyscf.lib.cupy_helper import ( take_last2d, transpose_sum, krylov, unpack_sparse, add_sparse, takebak, empty_mapped, dist_matrix, - grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph) + grouped_dot, grouped_gemm, cond, cart2sph_cutensor, cart2sph, + copy_array) class KnownValues(unittest.TestCase): def test_take_last2d(self): @@ -214,6 +215,41 @@ def test_unpack_tril(self): ref[:,idx,idy] = atril assert abs(a - ref).max() < 1e-12 + def test_copy_host2dev(self): + host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8) + host_data = numpy.ndarray(10**3, dtype=cupy.float64, buffer=host_array) + host_data = host_data.reshape(10,10,10) + host_data += numpy.random.rand(10,10,10) + + device_data = cupy.empty_like(host_data) + host_view = host_data[:, 8:] # Non-contiguous view on the host + device_view = device_data[:, 8:] # Non-contiguous view on the device + + copy_array(host_view, device_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + copy_array(host_view.copy(), device_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + device_view = copy_array(host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 
+ + def test_copy_dev2host(self): + host_array = cupy.cuda.alloc_pinned_memory(10*10*10 * 8) + host_data = numpy.ndarray(3*10**2, dtype=cupy.float64, buffer=host_array) + host_data = host_data.reshape(3,10,10) + + device_data = cupy.zeros_like(host_data) + device_data += cupy.random.rand(3,10,10) + host_view = host_data[:, 8:] # Non-contiguous view on the host + device_view = device_data[:, 8:] # Non-contiguous view on the device + + copy_array(device_view, host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + + copy_array(device_view.copy(), host_view) + assert numpy.linalg.norm(host_view - device_view.get()) < 1e-10 + if __name__ == "__main__": print("Full tests for cupy helper module") unittest.main() diff --git a/gpu4pyscf/mp/mp2.py b/gpu4pyscf/mp/mp2.py index c7fe059a..c12d68e4 100644 --- a/gpu4pyscf/mp/mp2.py +++ b/gpu4pyscf/mp/mp2.py @@ -349,6 +349,8 @@ def init_amps(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2): # to_cpu can be reused only when __init__ still takes mf def to_cpu(self): mf = self._scf.to_cpu() + if mf.converged: + mf.kernel() # create intermediate variables if converged from importlib import import_module mod = import_module(self.__module__.replace('gpu4pyscf', 'pyscf')) cls = getattr(mod, self.__class__.__name__) diff --git a/gpu4pyscf/mp/tests/test_mp2.py b/gpu4pyscf/mp/tests/test_mp2.py index 1570dd27..b5127816 100644 --- a/gpu4pyscf/mp/tests/test_mp2.py +++ b/gpu4pyscf/mp/tests/test_mp2.py @@ -37,6 +37,7 @@ def setUpModule(): 'O': 'cc-pvdz',} mol.build() mol.incore_anyway = True + mol.max_memory = 32000 mf = scf.RHF(mol) mf.conv_tol = 1e-12 mf.scf() diff --git a/gpu4pyscf/properties/ir.py b/gpu4pyscf/properties/ir.py index 61cfa72e..ec8b65b1 100644 --- a/gpu4pyscf/properties/ir.py +++ b/gpu4pyscf/properties/ir.py @@ -93,8 +93,9 @@ def eval_ir_freq_intensity(mf, hessian_obj): h1ao = hessian_obj.make_h1(mo_coeff, mo_occ, None, atmlst) # TODO: compact with hessian method, which can save one time 
cphf solve. # ! Different from PySCF, mo1 is all in mo! + fx = hessian_obj.gen_vind(mo_coeff, mo_occ) mo1, mo_e1 = hessian_obj.solve_mo1(mo_energy, mo_coeff, mo_occ, h1ao, - None, atmlst, hessian_obj.max_memory, log) + fx, atmlst, hessian_obj.max_memory, log) mo1 = cupy.asarray(mo1) mo_e1 = cupy.asarray(mo_e1) diff --git a/gpu4pyscf/qmmm/pbc/itrf.py b/gpu4pyscf/qmmm/pbc/itrf.py index 236e3e10..1b098c8f 100644 --- a/gpu4pyscf/qmmm/pbc/itrf.py +++ b/gpu4pyscf/qmmm/pbc/itrf.py @@ -1009,7 +1009,7 @@ def calculate_h1e(self, h1_gpu): nao = mol.nao if mm_mol.charge_model == 'gaussian' and len(coords) != 0: expnts = cp.hstack([mm_mol.get_zetas()] * len(Ls))[mask] - g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts).transpose(0,2,1) + g_qm += int1e_grids_ip1(mol, coords, charges = charges, charge_exponents = expnts) elif mm_mol.charge_model == 'point' and len(coords) != 0: raise RuntimeError("Not tested yet") max_memory = self.max_memory - lib.current_memory()[0] diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 28d76b57..3a0497ff 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -391,6 +391,9 @@ def __init__(self, mol): self._opt_gpu = {None: None} self._eri = None # Note: self._eri requires large amount of memory + __getstate__, __setstate__ = pyscf_lib.generate_pickle_methods( + excludes=('_opt_gpu', '_eri', '_numint')) + def check_sanity(self): s1e = self.get_ovlp() if isinstance(s1e, cupy.ndarray) and s1e.ndim == 2: diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index 2ecb5293..3d98ae5f 100644 --- a/gpu4pyscf/scf/j_engine.py +++ b/gpu4pyscf/scf/j_engine.py @@ -26,6 +26,7 @@ from pyscf import __config__ from gpu4pyscf.lib.cupy_helper import load_library, condense, sandwich_dot, transpose_sum from gpu4pyscf.__config__ import props as gpu_specs +from gpu4pyscf.__config__ import _num_devices from gpu4pyscf.lib import logger from gpu4pyscf.scf import jk from gpu4pyscf.scf.jk import 
_make_j_engine_pair_locs, RysIntEnvVars, _scale_sp_ctr_coeff @@ -51,7 +52,10 @@ def get_j(mol, dm, hermi=1, vhfopt=None, omega=None, verbose=None): cput0 = log.init_timer() if vhfopt is None: with mol.with_range_coulomb(omega): - vhfopt = _VHFOpt(mol).build() + groupsize = None + if _num_devices > 1: + groupsize = jk.GROUP_SIZE + vhfopt = _VHFOpt(mol).build(group_size=groupsize) if omega is None: omega = mol.omega diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index e1ff1d34..0e328204 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -58,8 +58,9 @@ SHM_SIZE = getattr(__config__, 'GPU_SHM_SIZE', int(gpu_specs['sharedMemPerBlockOptin']//9)*8) THREADS = 256 +GROUP_SIZE = 256 -def _jk_task(mol, dms, vhfopt, task_list, +def _jk_task(mol, dms, vhfopt, task_list, hermi=0, device_id=0, with_j=True, with_k=True, verbose=None): n_dm = dms.shape[0] nao, _ = vhfopt.coeff.shape @@ -76,6 +77,10 @@ def _jk_task(mol, dms, vhfopt, task_list, cput0 = log.init_timer() dms = cp.asarray(dms) + if hermi == 0: + # Contract the tril and triu parts separately + dms = cp.vstack([dms, dms.transpose(0,2,1)]) + n_dm = dms.shape[0] tile_q_ptr = ctypes.cast(vhfopt.tile_q_cond.data.ptr, ctypes.c_void_p) q_ptr = ctypes.cast(vhfopt.q_cond.data.ptr, ctypes.c_void_p) s_ptr = lib.c_null_ptr() @@ -103,41 +108,51 @@ def _jk_task(mol, dms, vhfopt, task_list, info = cp.empty(2, dtype=np.uint32) t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0) - for i, j in task_list: + for i, j, k, l in task_list: ij_shls = (l_ctr_bas_loc[i], l_ctr_bas_loc[i+1], l_ctr_bas_loc[j], l_ctr_bas_loc[j+1]) tile_ij_mapping = tile_mappings[i,j] - for k in range(i+1): - for l in range(k+1): - llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' - kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], - l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) - tile_kl_mapping = tile_mappings[k,l] - scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) - err = kern( - vj_ptr, vk_ptr, 
ctypes.cast(dms.data.ptr, ctypes.c_void_p), - ctypes.c_int(n_dm), ctypes.c_int(nao), - vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), - (ctypes.c_int*8)(*ij_shls, *kl_shls), - ctypes.c_int(tile_ij_mapping.size), - ctypes.c_int(tile_kl_mapping.size), - ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), - ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), - tile_q_ptr, q_ptr, s_ptr, - ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), - ctypes.c_float(log_cutoff), - ctypes.cast(pool.data.ptr, ctypes.c_void_p), - ctypes.cast(info.data.ptr, ctypes.c_void_p), - ctypes.c_int(workers), - mol._atm.ctypes, ctypes.c_int(mol.natm), - mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) - if err != 0: - raise RuntimeError(f'RYS_build_jk kernel for {llll} failed') - if log.verbose >= logger.DEBUG1: - msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' - t1, t1p = log.timer_debug1(msg, *t1), t1 - timing_counter[llll] += t1[1] - t1p[1] - kern_counts += 1 + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + kl_shls = (l_ctr_bas_loc[k], l_ctr_bas_loc[k+1], + l_ctr_bas_loc[l], l_ctr_bas_loc[l+1]) + tile_kl_mapping = tile_mappings[k,l] + scheme = quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + vj_ptr, vk_ptr, ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + vhfopt.rys_envs, (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*ij_shls, *kl_shls), + ctypes.c_int(tile_ij_mapping.size), + ctypes.c_int(tile_kl_mapping.size), + ctypes.cast(tile_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(tile_kl_mapping.data.ptr, ctypes.c_void_p), + tile_q_ptr, q_ptr, s_ptr, + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(info.data.ptr, ctypes.c_void_p), + ctypes.c_int(workers), + mol._atm.ctypes, ctypes.c_int(mol.natm), + mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes) + if err != 0: + raise 
RuntimeError(f'RYS_build_jk kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + msg = f'processing {llll}, tasks = {info[1].get()} on Device {device_id}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 + if with_j: + if hermi == 1: + vj *= 2. + else: + vj, vjT = vj[:n_dm//2], vj[n_dm//2:] + vj += vjT.transpose(0,2,1) + if with_k: + if hermi == 1: + vk = transpose_sum(vk) + else: + vk, vkT = vk[:n_dm//2], vk[n_dm//2:] + vk += vkT.transpose(0,2,1) return vj, vk, kern_counts, timing_counter def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None): @@ -157,9 +172,7 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None #:dms = cp.einsum('pi,nij,qj->npq', vhfopt.coeff, dms, vhfopt.coeff) dms = sandwich_dot(dms, vhfopt.coeff.T) dms = cp.asarray(dms, order='C') - if hermi == 0: - # Contract the tril and triu parts separately - dms = cp.vstack([dms, dms.transpose(0,2,1)]) + n_dm = dms.shape[0] assert with_j or with_k @@ -171,7 +184,12 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None l_symb = [lib.param.ANGULAR[i] for i in uniq_l] n_groups = np.count_nonzero(uniq_l <= LMAX) - tasks = [(i,j) for i in range(n_groups) for j in range(i+1)] + tasks = [] + for i in range(n_groups): + for j in range(i+1): + for k in range(i+1): + for l in range(k+1): + tasks.append((i,j,k,l)) tasks = np.array(tasks) task_list = [] for device_id in range(_num_devices): @@ -183,8 +201,8 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None for device_id in range(_num_devices): future = executor.submit( _jk_task, - mol, dms, vhfopt, task_list[device_id], - with_j=with_j, with_k=with_k, verbose=verbose, + mol, dms, vhfopt, task_list[device_id], hermi=hermi, + with_j=with_j, with_k=with_k, verbose=verbose, device_id=device_id) futures.append(future) @@ -210,28 +228,17 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, 
with_j=True, with_k=True, verbose=None vj = vk = None if with_k: vk = reduce_to_device(vk_dist, inplace=True) - if hermi == 1: - vk = transpose_sum(vk) - else: - vk, vkT = vk[:n_dm//2], vk[n_dm//2:] - vk += vkT.transpose(0,2,1) #:vk = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vk, vhfopt.coeff) vk = sandwich_dot(vk, vhfopt.coeff) - vk = vk.reshape(dm.shape) - + if with_j: vj = reduce_to_device(vj_dist, inplace=True) - if hermi == 1: - vj *= 2. - else: - vj, vjT = vj[:n_dm//2], vj[n_dm//2:] - vj += vjT.transpose(0,2,1) vj = transpose_sum(vj) #:vj = cp.einsum('pi,npq,qj->nij', vhfopt.coeff, vj, vhfopt.coeff) vj = sandwich_dot(vj, vhfopt.coeff) - vj = vj.reshape(dm.shape) h_shls = vhfopt.h_shls + if h_shls: cput1 = log.timer_debug1('get_jk pass 1 on gpu', *cput0) log.debug3('Integrals for %s functions on CPU', l_symb[LMAX+1]) @@ -270,6 +277,11 @@ def get_jk(mol, dm, hermi=0, vhfopt=None, with_j=True, with_k=True, verbose=None for i, v in enumerate(vk1): vk[i] += coeff.T.dot(cp.asarray(v)).dot(coeff) log.timer_debug1('get_jk pass 2 for h functions on cpu', *cput1) + + if with_j: + vj = vj.reshape(dm.shape) + if with_k: + vk = vk.reshape(dm.shape) log.timer('vj and vk', *cput0) return vj, vk diff --git a/gpu4pyscf/scf/soscf.py b/gpu4pyscf/scf/soscf.py index 81da0361..6d9bf87b 100644 --- a/gpu4pyscf/scf/soscf.py +++ b/gpu4pyscf/scf/soscf.py @@ -27,7 +27,7 @@ from pyscf.soscf import ciah from pyscf.soscf.newton_ah import _CIAH_SOSCF as _SOSCF_cpu from gpu4pyscf.lib import logger -from gpu4pyscf.scf import hf, rohf, uhf +from gpu4pyscf.scf import hf, rohf, uhf, _response_functions from gpu4pyscf.lib.cupy_helper import transpose_sum, contract from gpu4pyscf.lib import utils diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py index 530f6cc8..dd8f7b51 100644 --- a/gpu4pyscf/scf/tests/test_rhf.py +++ b/gpu4pyscf/scf/tests/test_rhf.py @@ -273,8 +273,8 @@ def test_chkfile(self): mf_copy = scf.RHF(mol) mf_copy.chkfile = ftmp.name dm_loaded = 
mf_copy.init_guess_by_chkfile() - assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise. - + # Since we reload the MO coefficients, the density matrix should be identical up to numerical noise. + assert np.allclose(dm_stored, dm_loaded, atol = 1e-14) # TODO: #test analyze #test mulliken_pop diff --git a/gpu4pyscf/scf/tests/test_scf_jk.py b/gpu4pyscf/scf/tests/test_scf_jk.py index 231a36cf..78ae68eb 100644 --- a/gpu4pyscf/scf/tests/test_scf_jk.py +++ b/gpu4pyscf/scf/tests/test_scf_jk.py @@ -15,7 +15,7 @@ import unittest import numpy as np import pyscf -from pyscf import lib +from pyscf import lib, gto from gpu4pyscf.scf import jk from pyscf.scf.hf import get_jk @@ -125,4 +125,3 @@ def test_jk_hermi0(): assert abs(vj2+vj3 - vj1).max() < 1e-9 assert abs(vk2+vk3 - vk1).max() < 1e-9 - \ No newline at end of file diff --git a/gpu4pyscf/scf/tests/test_soscf.py b/gpu4pyscf/scf/tests/test_soscf.py index 924dfd2e..4a07bcc5 100644 --- a/gpu4pyscf/scf/tests/test_soscf.py +++ b/gpu4pyscf/scf/tests/test_soscf.py @@ -24,18 +24,18 @@ def setUpModule(): verbose = 5, output = '/dev/null', atom = [ - ["O" , (0. , 0. , 0.)], - [1 , (0. , -0.757 , 0.587)], - [1 , (0. , 0.757 , 0.587)] ], + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. , 0.757 , 0.587)] ], basis = '6-31g') h2o_z1 = gto.M( verbose = 5, output = '/dev/null', atom = [ - ["O" , (0. , 0. , 0.)], - [1 , (0. , -0.757 , 0.587)], - [1 , (0. , 0.757 , 0.587)] ], + ["O" , (0. , 0. , 0.)], + [1 , (0. , -0.757 , 0.587)], + [1 , (0. 
, 0.757 , 0.587)] ], basis = '6-31g', charge = 1, spin = 1,) diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py index 0544f751..3fe7cb6c 100644 --- a/gpu4pyscf/solvent/grad/pcm.py +++ b/gpu4pyscf/solvent/grad/pcm.py @@ -24,7 +24,7 @@ from pyscf import lib from pyscf import gto from pyscf.grad import rhf as rhf_grad - +from gpu4pyscf.gto import int3c1e from gpu4pyscf.solvent.pcm import PI, switch_h, libsolvent from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 from gpu4pyscf.lib.cupy_helper import contract @@ -239,11 +239,16 @@ def grad_qv(pcmobj, dm): grid_coords = pcmobj.surface['grid_coords'] q_sym = pcmobj._intermediates['q_sym'] - dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2) - dq = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, direct_scf_tol = 1e-14, charge_exponents = charge_exp**2) + intopt = int3c1e.VHFOpt(mol) + intopt.build(1e-14, aosym=False) + dvj = int1e_grids_ip1(mol, grid_coords, dm = dm, charges = q_sym, + direct_scf_tol = 1e-14, charge_exponents = charge_exp**2, + intopt=intopt) + dq = int1e_grids_ip2(mol, grid_coords, dm = dm, charges = q_sym, + direct_scf_tol = 1e-14, charge_exponents = charge_exp**2, + intopt=intopt) aoslice = mol.aoslice_by_atom() - aoslice = cupy.array(aoslice) dvj = 2.0 * cupy.asarray([cupy.sum(dvj[:,p0:p1], axis=1) for p0,p1 in aoslice[:,2:]]) dq = cupy.asarray([cupy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice]) de = dq + dvj diff --git a/gpu4pyscf/solvent/grad/smd.py b/gpu4pyscf/solvent/grad/smd.py index a3d850db..32ebc2ee 100644 --- a/gpu4pyscf/solvent/grad/smd.py +++ b/gpu4pyscf/solvent/grad/smd.py @@ -25,100 +25,10 @@ from gpu4pyscf.solvent import pcm, smd from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.lib import logger -from gpu4pyscf.lib.cupy_helper import contract def get_cds(smdobj): return smd.get_cds_legacy(smdobj)[1] -""" -def grad_solver(smdobj, dm): - ''' - 
dE = 0.5*v* d(K^-1 R) *v + q*dv - v^T* d(K^-1 R)v = v^T*K^-1(dR - dK K^-1R)v = v^T K^-1(dR - dK q) - ''' - mol = smdobj.mol - log = logger.new_logger(mol, mol.verbose) - t1 = log.init_timer() - if not smdobj._intermediates: - smdobj.build() - dm_cache = smdobj._intermediates.get('dm', None) - if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: - pass - else: - smdobj._get_vind(dm) - - gridslice = smdobj.surface['gslice_by_atom'] - v_grids = smdobj._intermediates['v_grids'] - A = smdobj._intermediates['A'] - D = smdobj._intermediates['D'] - S = smdobj._intermediates['S'] - K = smdobj._intermediates['K'] - q = smdobj._intermediates['q'] - - vK_1 = cupy.linalg.solve(K.T, v_grids) - - dF, dA = pcm_grad.get_dF_dA(smdobj.surface) - - with_D = smdobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD'] - dD, dS, dSii = pcm_grad.get_dD_dS(smdobj.surface, dF, with_D=with_D, with_S=True) - - epsilon = smdobj.eps - de = cupy.zeros([smdobj.mol.natm,3]) - - def contract_bra(a, B, c): - ''' i,xij,j->jx ''' - tmp = a.dot(B) - return (tmp * c).T - - def contract_ket(a, B, c): - ''' i,xij,j->ix ''' - tmp = B.dot(c) - return (a*tmp).T - - # IEF-PCM and SS(V)PE formally are the same in gradient calculation - # dR = f_eps/(2*pi) * (dD*A + D*dA), - # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) - f_epsilon = (epsilon - 1.0)/(epsilon + 1.0) - fac = f_epsilon/(2.0*np.pi) - - Av = A*v_grids - de_dR = 0.5*fac * contract_ket(vK_1, dD, Av) - de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av) - de_dR = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_D = vK_1.dot(D) - vK_1_Dv = vK_1_D * v_grids - de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA) - - de_dS0 = 0.5*contract_ket(vK_1, dS, q) - de_dS0 -= 0.5*contract_bra(vK_1, dS, q) - de_dS0 = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_q = vK_1 * q - de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii) - - vK_1_DA = vK_1_D*A - de_dS1 = 
0.5*contract_ket(vK_1_DA, dS, q) - de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q) - de_dS1 = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice]) - - vK_1_DAq = vK_1_DA*q - de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii) - - Sq = cupy.dot(S,q) - ASq = A*Sq - de_dD = 0.5*contract_ket(vK_1, dD, ASq) - de_dD -= 0.5*contract_bra(vK_1, dD, ASq) - de_dD = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice]) - - de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA) # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq) - - de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1) - de += de_dR - de_dK - - t1 = log.timer_debug1('grad solver', *t1) - return de.get() -""" grad_solver = pcm_grad.grad_solver def make_grad_object(grad_method): diff --git a/gpu4pyscf/solvent/hessian/pcm.py b/gpu4pyscf/solvent/hessian/pcm.py index 29b41588..538cb859 100644 --- a/gpu4pyscf/solvent/hessian/pcm.py +++ b/gpu4pyscf/solvent/hessian/pcm.py @@ -22,12 +22,16 @@ from pyscf import lib, gto from gpu4pyscf import scf from gpu4pyscf.solvent.pcm import PI -from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc +from gpu4pyscf.solvent.grad.pcm import grad_qv, grad_solver, grad_nuc, get_dD_dS, get_dF_dA, get_dSii from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.cupy_helper import contract from gpu4pyscf.lib import logger +from gpu4pyscf.hessian.jk import _ao2mo +from gpu4pyscf.gto.int3c1e_ip import int1e_grids_ip1, int1e_grids_ip2 +from gpu4pyscf.gto import int3c1e +from gpu4pyscf.gto.int3c1e import int1e_grids def hess_nuc(pcmobj): + raise NotImplementedError("Not tested") if not pcmobj._intermediates: pcmobj.build() mol = pcmobj.mol @@ -149,76 +153,282 @@ def pcm_grad_scanner(mol): pcmobj.reset(pmol) return de -def fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): - ''' - dv_solv / da - slow version with finite difference - ''' - log = logger.new_logger(pcmobj, verbose) - t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - 
if atmlst is None: - atmlst = range(mol.natm) - nao, nmo = mo_coeff.shape - mocc = mo_coeff[:,mo_occ>0] - nocc = mocc.shape[1] - coords = mol.atom_coords(unit='Bohr') - def pcm_vmat_scanner(mol): - pcmobj.reset(mol) - e, v = pcmobj._get_vind(dm) - return v +def get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K): + assert pcmobj._intermediates is not None - mol.verbose = 0 - vmat = cupy.empty([len(atmlst), 3, nao, nocc]) - eps = 1e-3 - for i0, ia in enumerate(atmlst): - for ix in range(3): - dv = numpy.zeros_like(coords) - dv[ia,ix] = eps - mol.set_geom_(coords + dv, unit='Bohr') - vmat0 = pcm_vmat_scanner(mol) + gridslice = pcmobj.surface['gslice_by_atom'] + v_grids = pcmobj._intermediates['v_grids'] + A = pcmobj._intermediates['A'] + D = pcmobj._intermediates['D'] + S = pcmobj._intermediates['S'] + R = pcmobj._intermediates['R'] + q_sym = pcmobj._intermediates['q_sym'] + f_epsilon = pcmobj._intermediates['f_epsilon'] - mol.set_geom_(coords - dv, unit='Bohr') - vmat1 = pcm_vmat_scanner(mol) + ngrids = q_sym.shape[0] - grad_vmat = (vmat0 - vmat1)/2.0/eps - grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) - grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) - vmat[i0,ix] = grad_vmat - t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat + def get_dS_dot_q(dS, dSii, q, atmlst, gridslice): + output = cupy.einsum('diA,i->Adi', dSii[:,:,atmlst], q) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dS[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dS[:,:,g0:g1], q[g0:g1]) + return output + def get_dST_dot_q(dS, dSii, q, atmlst, gridslice): + return get_dS_dot_q(-dS.transpose(0,2,1), dSii, q, atmlst, gridslice) + + def get_dA_dot_q(dA, q, atmlst, gridslice): + return cupy.einsum('diA,i->Adi', dA[:,:,atmlst], q) + + def get_dD_dot_q(dD, q, atmlst, gridslice): + output = cupy.zeros([len(atmlst), 3, ngrids]) + for i_atom in atmlst: + g0,g1 = 
gridslice[i_atom] + output[i_atom, :, g0:g1] += cupy.einsum('dij,j->di', dD[:,g0:g1,:], q) + output[i_atom, :, :] -= cupy.einsum('dij,j->di', dD[:,:,g0:g1], q[g0:g1]) + return output + def get_dDT_dot_q(dD, q, atmlst, gridslice): + return get_dD_dot_q(-dD.transpose(0,2,1), q, atmlst, gridslice) + + if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']: + _, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True) + dF, _ = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + # dR = 0, dK = dS + dSdx_dot_q = get_dS_dot_q(dS, dSii, q_sym, atmlst, gridslice) + + dqdx_fix_Vq = cupy.einsum('ij,Adj->Adi', inverse_K, dSdx_dot_q) + + elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + # dR = f_eps/(2*pi) * (dD*A + D*dA) + # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS) + f_eps_over_2pi = f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_2pi * dDdx_dot_ASq + + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + 
dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dSdxT_dot_invKT_V = get_dST_dot_q(dS, dSii, invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V = dSdxT_dot_invKT_V + + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_invKT_V) + dKdxT_dot_invKT_V -= f_eps_over_2pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_invKT_V) + + dSdxT_dot_AT_DT_invKT_V = get_dST_dot_q(dS, dSii, DA.T @ invKT_V, atmlst, gridslice) + dKdxT_dot_invKT_V -= f_eps_over_2pi * dSdxT_dot_AT_DT_invKT_V + + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdxT_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + elif pcmobj.method.upper() in ['SS(V)PE']: + dF, dA = get_dF_dA(pcmobj.surface) + dSii = get_dSii(pcmobj.surface, dF) + dF = None + + dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True) + + f_eps_over_4pi = f_epsilon/(4.0*PI) -""" -def analytic_grad_vmat(pcmobj, mo_coeff, mo_occ, atmlst=None, verbose=None): + def dK_dot_q(q): + dSdx_dot_q = get_dS_dot_q(dS, dSii, q, atmlst, gridslice) + + DA = D*A + dKdx_dot_q = dSdx_dot_q - f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', DA, dSdx_dot_q) + + dAdx_dot_Sq = get_dA_dot_q(dA, S @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', D, dAdx_dot_Sq) + + AS = (A * S.T).T # It's just diag(A) @ S + dDdx_dot_ASq = get_dD_dot_q(dD, AS @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dDdx_dot_ASq + + dDdxT_dot_q = get_dDT_dot_q(dD, q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', AS.T, dDdxT_dot_q) + + dAdxT_dot_DT_q = get_dA_dot_q(dA, D.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * cupy.einsum('ij,Adj->Adi', S.T, dAdxT_dot_DT_q) + + dSdxT_dot_AT_DT_q = get_dST_dot_q(dS, dSii, DA.T @ q, atmlst, gridslice) + dKdx_dot_q -= f_eps_over_4pi * dSdxT_dot_AT_DT_q + + return dKdx_dot_q + + f_eps_over_2pi = 
f_epsilon/(2.0*PI) + + q = inverse_K @ R @ v_grids + dKdx_dot_q = dK_dot_q(q) + dqdx_fix_Vq = -cupy.einsum('ij,Adj->Adi', inverse_K, dKdx_dot_q) + + dAdx_dot_V = get_dA_dot_q(dA, v_grids, atmlst, gridslice) + + dDdx_dot_AV = get_dD_dot_q(dD, A * v_grids, atmlst, gridslice) + + dRdx_dot_V = f_eps_over_2pi * (dDdx_dot_AV + cupy.einsum('ij,Adj->Adi', D, dAdx_dot_V)) + dqdx_fix_Vq += cupy.einsum('ij,Adj->Adi', inverse_K, dRdx_dot_V) + + invKT_V = inverse_K.T @ v_grids + dDdxT_dot_invKT_V = get_dDT_dot_q(dD, invKT_V, atmlst, gridslice) + + DT_invKT_V = D.T @ invKT_V + dAdxT_dot_DT_invKT_V = get_dA_dot_q(dA, DT_invKT_V, atmlst, gridslice) + dqdx_fix_Vq += f_eps_over_2pi * (cupy.einsum('i,Adi->Adi', A, dDdxT_dot_invKT_V) + dAdxT_dot_DT_invKT_V) + + dKdx_dot_invKT_V = dK_dot_q(invKT_V) + dqdx_fix_Vq += -cupy.einsum('ij,Adj->Adi', R.T @ inverse_K.T, dKdx_dot_invKT_V) + + dqdx_fix_Vq *= -0.5 + + else: + raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}") + + return dqdx_fix_Vq + +def get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative): + assert pcmobj._intermediates is not None + + mol = pcmobj.mol + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + R = pcmobj._intermediates['R'] + + atom_coords = mol.atom_coords(unit='B') + atom_charges = numpy.asarray(mol.atom_charges(), dtype=numpy.float64) + atom_coords = atom_coords[atmlst] + atom_charges = atom_charges[atmlst] + fakemol_nuc = gto.fakemol_for_charges(atom_coords) + fakemol = gto.fakemol_for_charges(grid_coords.get(), expnt=charge_exp.get()**2) + int2c2e_ip1 = mol._add_suffix('int2c2e_ip1') + v_ng_ip1 = gto.mole.intor_cross(int2c2e_ip1, fakemol_nuc, fakemol) + v_ng_ip1 = cupy.array(v_ng_ip1) + dV_on_charge_dx = cupy.einsum('dAq,A->Adq', v_ng_ip1, atom_charges) + + v_ng_ip2 = gto.mole.intor_cross(int2c2e_ip1, fakemol, fakemol_nuc) + v_ng_ip2 = cupy.array(v_ng_ip2) + for i_atom in atmlst: + g0,g1 = 
gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] += cupy.einsum('dqA,A->dq', v_ng_ip2[:,g0:g1,:], atom_charges) + + dIdA = int1e_grids_ip1(mol, grid_coords, dm = dm + dm.T, intopt = intopt_derivative, charge_exponents = charge_exp**2) + dV_on_charge_dx[atmlst,:,:] -= dIdA[atmlst,:,:] + + dIdC = int1e_grids_ip2(mol, grid_coords, intopt = intopt_derivative, dm = dm, charge_exponents = charge_exp**2) + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dV_on_charge_dx[i_atom,:,g0:g1] -= dIdC[:,g0:g1] + + KR_symmetrized = 0.5 * (inverse_K @ R + R.T @ inverse_K.T) + dqdx_fix_K_R = cupy.einsum('ij,Adj->Adi', KR_symmetrized, dV_on_charge_dx) + + return dqdx_fix_K_R + +def get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative): + K = pcmobj._intermediates['K'] + inverse_K = cupy.linalg.inv(K) + return get_dqsym_dx_fix_vgrids(pcmobj, atmlst, inverse_K) + get_dqsym_dx_fix_K_R(pcmobj, dm, atmlst, inverse_K, intopt_derivative) + +def analytic_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None, verbose=None): ''' dv_solv / da - slow version with finite difference ''' + if not pcmobj._intermediates: + pcmobj.build() + dm_cache = pcmobj._intermediates.get('dm', None) + if dm_cache is not None and cupy.linalg.norm(dm_cache - dm) < 1e-10: + pass + else: + pcmobj._get_vind(dm) + mol = pcmobj.mol log = logger.new_logger(pcmobj, verbose) t1 = log.init_timer() - pmol = pcmobj.mol.copy() - mol = pmol.copy() - if atmlst is None: - atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] - dm = cupy.dot(mocc, mocc.T) * 2 - coords = mol.atom_coords(unit='Bohr') - # TODO: add those contributions - # contribution due to _get_v - # contribution due to linear solver - # contribution due to _get_vmat + if atmlst is None: + atmlst = range(mol.natm) - vmat = cupy.zeros([len(atmlst), 3, nao, nocc]) + gridslice = pcmobj.surface['gslice_by_atom'] + charge_exp = pcmobj.surface['charge_exp'] + grid_coords = pcmobj.surface['grid_coords'] + q_sym = 
pcmobj._intermediates['q_sym'] + + aoslice = mol.aoslice_by_atom() + aoslice = numpy.array(aoslice) + + intopt_fock = int3c1e.VHFOpt(mol) + intopt_fock.build(cutoff = 1e-14, aosym = True) + intopt_derivative = int3c1e.VHFOpt(mol) + intopt_derivative.build(cutoff = 1e-14, aosym = False) + + dIdx_mo = cupy.empty([len(atmlst), 3, nmo, nocc]) + + dIdA = int1e_grids_ip1(mol, grid_coords, charges = q_sym, intopt = intopt_derivative, charge_exponents = charge_exp**2) + for i_atom in atmlst: + p0,p1 = aoslice[i_atom, 2:] + # dIdx[i_atom, :, :, :] = 0 + # dIdx[i_atom, :, p0:p1, :] += dIdA[:, p0:p1, :] + # dIdx[i_atom, :, :, p0:p1] += dIdA[:, p0:p1, :].transpose(0,2,1) + dIdA_mo = dIdA[:, p0:p1, :] @ mocc + dIdA_mo = cupy.einsum('ip,dpj->dij', mo_coeff[p0:p1, :].T, dIdA_mo) + dIdB_mo = dIdA[:, p0:p1, :].transpose(0,2,1) @ mocc[p0:p1, :] + dIdB_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdB_mo) + dIdx_mo[i_atom, :, :, :] = dIdA_mo + dIdB_mo + + for i_atom in atmlst: + g0,g1 = gridslice[i_atom] + dIdC = int1e_grids_ip2(mol, grid_coords[g0:g1,:], charges = q_sym[g0:g1], + intopt = intopt_derivative, charge_exponents = charge_exp[g0:g1]**2) + dIdC_mo = dIdC @ mocc + dIdC_mo = cupy.einsum('ip,dpj->dij', mo_coeff.T, dIdC_mo) + dIdx_mo[i_atom, :, :, :] += dIdC_mo + + dV_on_molecule_dx_mo = dIdx_mo + + dqdx = get_dqsym_dx(pcmobj, dm, atmlst, intopt_derivative) + for i_atom in atmlst: + for i_xyz in range(3): + dIdx_from_dqdx = int1e_grids(mol, grid_coords, charges = dqdx[i_atom, i_xyz, :], + intopt = intopt_fock, charge_exponents = charge_exp**2) + dV_on_molecule_dx_mo[i_atom, i_xyz, :, :] += mo_coeff.T @ dIdx_from_dqdx @ mocc t1 = log.timer_debug1('computing solvent grad veff', *t1) - pcmobj.reset(pmol) - return vmat -""" + return dV_on_molecule_dx_mo def make_hess_object(hess_method): if hess_method.base.with_solvent.frozen: @@ -273,7 +483,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, 
verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -282,14 +492,38 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] return h1aoa, h1aob else: raise NotImplementedError('Base object is not supported') + + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): + v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) + if not self.base.with_solvent.equilibrium_solvation: + return v1vo + v_solvent = self.base.with_solvent._B_dot_x(dms) + if isinstance(self.base, scf.uhf.UHF): + n_dm = dms.shape[1] + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + moa, mob = mo_coeff + nmoa = moa.shape[1] + nocca = mocca.shape[1] + v1vo_sol = v_solvent[0] + v_solvent[1] + v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1) + v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1) + elif isinstance(self.base, scf.hf.RHF): + n_dm = dms.shape[0] + mocc = mo_coeff[:,mo_occ>0] + v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1) + else: + raise NotImplementedError('Base object is not supported') + return v1vo + def _finalize(self): # 
disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/hessian/smd.py b/gpu4pyscf/solvent/hessian/smd.py index 58cc637f..49897d74 100644 --- a/gpu4pyscf/solvent/hessian/smd.py +++ b/gpu4pyscf/solvent/hessian/smd.py @@ -25,6 +25,7 @@ from gpu4pyscf.solvent.grad import smd as smd_grad from gpu4pyscf.solvent.grad import pcm as pcm_grad from gpu4pyscf.solvent.hessian import pcm as pcm_hess +from gpu4pyscf.hessian.jk import _ao2mo def get_cds(smdobj): mol = smdobj.mol @@ -153,7 +154,7 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): h1ao = super().make_h1(mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) if isinstance(self.base, scf.hf.RHF): dm = self.base.make_rdm1(ao_repr=True) - dv = pcm_hess.fd_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) + dv = pcm_hess.analytic_grad_vmat(self.base.with_solvent, dm, mo_coeff, mo_occ, atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1ao[i0] += dv[i0] return h1ao @@ -162,14 +163,39 @@ def make_h1(self, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): solvent = self.base.with_solvent dm = self.base.make_rdm1(ao_repr=True) dm = dm[0] + dm[1] - dva = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) - dvb = pcm_hess.fd_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) + dva = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[0], mo_occ[0], atmlst=atmlst, verbose=verbose) + dvb = pcm_hess.analytic_grad_vmat(solvent, dm, mo_coeff[1], mo_occ[1], atmlst=atmlst, verbose=verbose) for i0, ia in enumerate(atmlst): h1aoa[i0] += dva[i0] h1aob[i0] += dvb[i0] return h1aoa, h1aob else: raise NotImplementedError('Base object is not supported') + + def get_veff_resp_mo(self, mol, dms, mo_coeff, mo_occ, hermi=1): + v1vo = super().get_veff_resp_mo(mol, dms, mo_coeff, mo_occ, hermi=hermi) + if not 
self.base.with_solvent.equilibrium_solvation: + return v1vo + v_solvent = self.base.with_solvent._B_dot_x(dms) + + if isinstance(self.base, scf.uhf.UHF): + n_dm = dms.shape[1] + mocca = mo_coeff[0][:,mo_occ[0]>0] + moccb = mo_coeff[1][:,mo_occ[1]>0] + moa, mob = mo_coeff + nmoa = moa.shape[1] + nocca = mocca.shape[1] + v1vo_sol = v_solvent[0] + v_solvent[1] + v1vo[:,:nmoa*nocca] += _ao2mo(v1vo_sol, mocca, moa).reshape(n_dm,-1) + v1vo[:,nmoa*nocca:] += _ao2mo(v1vo_sol, moccb, mob).reshape(n_dm,-1) + elif isinstance(self.base, scf.hf.RHF): + n_dm = dms.shape[0] + mocc = mo_coeff[:,mo_occ>0] + v1vo += _ao2mo(v_solvent, mocc, mo_coeff).reshape(n_dm,-1) + else: + raise NotImplementedError('Base object is not supported') + return v1vo + def _finalize(self): # disable _finalize. It is called in grad_method.kernel method # where self.de was not yet initialized. diff --git a/gpu4pyscf/solvent/tests/test_pcm_hessian.py b/gpu4pyscf/solvent/tests/test_pcm_hessian.py index 33bf0e67..c7076f29 100644 --- a/gpu4pyscf/solvent/tests/test_pcm_hessian.py +++ b/gpu4pyscf/solvent/tests/test_pcm_hessian.py @@ -14,12 +14,15 @@ import unittest import numpy as np +import cupy as cp import pyscf import pytest from pyscf import gto from gpu4pyscf.solvent import pcm from gpu4pyscf import scf, dft from packaging import version +from gpu4pyscf.solvent.hessian.pcm import analytic_grad_vmat +from gpu4pyscf.lib.cupy_helper import contract pyscf_25 = version.parse(pyscf.__version__) <= version.parse('2.5.0') @@ -50,7 +53,7 @@ def _make_mf(method='C-PCM', restricted=True, density_fit=True): mf = dft.rks.RKS(mol, xc=xc) else: mf = dft.uks.UKS(mol, xc=xc) - + if density_fit: mf = mf.density_fit() mf = mf.PCM() @@ -89,6 +92,44 @@ def _check_hessian(mf, h, ix=0, iy=0): print(f'Norm of H({ix},{iy}) diff, {np.linalg.norm(h[ix,:,iy,:] - h_fd)}') assert(np.linalg.norm(h[ix,:,iy,:] - h_fd) < tol) +def _fd_grad_vmat(pcmobj, dm, mo_coeff, mo_occ, atmlst=None): + ''' + dv_solv / da + slow version with finite 
difference + ''' + pmol = pcmobj.mol.copy() + mol = pmol.copy() + if atmlst is None: + atmlst = range(mol.natm) + nao, nmo = mo_coeff.shape + mocc = mo_coeff[:,mo_occ>0] + nocc = mocc.shape[1] + coords = mol.atom_coords(unit='Bohr') + def pcm_vmat_scanner(mol): + pcmobj.reset(mol) + e, v = pcmobj._get_vind(dm) + return v + + mol.verbose = 0 + vmat = cp.empty([len(atmlst), 3, nao, nocc]) + eps = 1e-5 + for i0, ia in enumerate(atmlst): + for ix in range(3): + dv = np.zeros_like(coords) + dv[ia,ix] = eps + mol.set_geom_(coords + dv, unit='Bohr') + vmat0 = pcm_vmat_scanner(mol) + + mol.set_geom_(coords - dv, unit='Bohr') + vmat1 = pcm_vmat_scanner(mol) + + grad_vmat = (vmat0 - vmat1)/2.0/eps + grad_vmat = contract("ij,jq->iq", grad_vmat, mocc) + grad_vmat = contract("iq,ip->pq", grad_vmat, mo_coeff) + vmat[i0,ix] = grad_vmat + pcmobj.reset(pmol) + return vmat + @unittest.skipIf(pcm.libsolvent is None, "solvent extension not compiled") class KnownValues(unittest.TestCase): def test_df_hess_cpcm(self): @@ -142,6 +183,48 @@ def test_uks_hess_iefpcm(self): _check_hessian(mf, h, ix=0, iy=0) _check_hessian(mf, h, ix=0, iy=1) + def test_grad_vmat_cpcm(self): + print("testing C-PCM dV_solv/dx") + mf = _make_mf(method='C-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + + def test_grad_vmat_iefpcm(self): + print("testing IEF-PCM dV_solv/dx") + mf = _make_mf(method='IEF-PCM') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 
1e-10) + + def test_grad_vmat_ssvpe(self): + print("testing SS(V)PE dV_solv/dx") + mf = _make_mf(method='SS(V)PE') + hobj = mf.Hessian() + + dm = mf.make_rdm1() + mo_coeff = mf.mo_coeff + mo_occ = mf.mo_occ + + test_grad_vmat = analytic_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + ref_grad_vmat = _fd_grad_vmat(hobj.base.with_solvent, dm, mo_coeff, mo_occ) + + cp.testing.assert_allclose(ref_grad_vmat, test_grad_vmat, atol = 1e-10) + @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher') def test_to_gpu(self): import pyscf @@ -187,7 +270,7 @@ def test_to_cpu(self): mol.basis = 'sto-3g' mol.output = '/dev/null' mol.build(verbose=0) - + mf = dft.RKS(mol, xc='b3lyp').PCM() mf.conv_tol = 1e-12 mf.conv_tol_cpscf = 1e-7 @@ -209,6 +292,7 @@ def test_to_cpu(self): hessobj = hessobj.to_cpu() hess_cpu = hessobj.kernel() assert np.linalg.norm(hess_cpu - hess_gpu) < 1e-5 + if __name__ == "__main__": print("Full Tests for Hessian of PCMs") unittest.main() diff --git a/gpu4pyscf/tests/020_Vitamin_C.xyz b/gpu4pyscf/tests/020_Vitamin_C.xyz new file mode 100644 index 00000000..e119c6d3 --- /dev/null +++ b/gpu4pyscf/tests/020_Vitamin_C.xyz @@ -0,0 +1,22 @@ +20 +Vitamin C +C -0.07551087 1.68127663 -0.10745193 +O 1.33621755 1.87147409 -0.39326987 +C 1.67074668 2.95729545 0.49387976 +C 0.41740763 3.77281969 0.78495878 +C -0.60481480 3.07572636 0.28906224 +H -0.19316298 1.01922455 0.72486113 +O 0.35092043 5.03413298 1.45545728 +H 0.42961487 5.74279041 0.81264173 +O -1.95331750 3.53349874 0.15912025 +H -2.55333895 2.78846397 0.23972698 +O 2.81976302 3.20110148 0.94542226 +C -0.81772499 1.09230218 -1.32146482 +H -0.70955636 1.74951833 -2.15888136 +C -2.31163857 0.93420736 -0.98260166 +H -2.72575463 1.89080093 -0.74107186 +H -2.41980721 0.27699120 -0.14518512 +O -0.26428017 -0.18613595 -1.64425697 +H -0.72695910 -0.55328886 -2.40104423 +O -3.00083741 0.38730252 -2.10989934 +H -3.93210821 0.28874990 -1.89865997 diff --git a/gpu4pyscf/tests/057_Tamoxifen.xyz 
b/gpu4pyscf/tests/057_Tamoxifen.xyz new file mode 100644 index 00000000..b51df6f5 --- /dev/null +++ b/gpu4pyscf/tests/057_Tamoxifen.xyz @@ -0,0 +1,59 @@ +57 +Tamoxifen +C -1.42666665 1.35988349 0.01780185 +C -0.75139234 2.53486079 0.01780185 +C -2.96666665 1.35988349 0.01780185 +C -3.66418809 0.15160568 0.01780185 +C -3.66417225 2.56778831 0.01791304 +C -5.05890001 0.15132789 0.01723115 +H -3.11399504 -0.80051230 0.01693694 +C -5.05931013 2.56768367 0.01833813 +H -3.11457497 3.52019148 0.01809296 +C -5.75673144 1.35973487 0.01785909 +H -5.60876287 -0.80100973 0.01659711 +H -5.60899513 3.52021733 0.01884114 +H -6.85641138 1.35926586 0.01746817 +C -1.51874951 3.87006226 0.01780185 +C -1.63823871 4.60590036 -1.16149287 +C -2.09440347 4.34371845 1.19670832 +C -2.33266580 5.81544273 -1.16163975 +H -1.18363273 4.23258432 -2.09058400 +C -2.78991814 5.55312706 1.19651365 +H -2.00047584 3.76380313 2.12622693 +C -2.90901419 6.28907563 0.01764434 +H -2.42635385 6.39580205 -2.09099551 +H -3.24404320 5.92613353 2.12608927 +C 0.78860766 2.53486079 0.01780185 +C 1.48612910 3.74313859 0.01780185 +C 1.48611327 1.32695597 0.01791304 +C 2.88084102 3.74341639 0.01723115 +H 0.93593606 4.69525658 0.01693694 +C 2.88125115 1.32706060 0.01833813 +H 0.93651599 0.37455279 0.01809296 +C 3.57867246 2.53500940 0.01785909 +H 3.43070389 4.69575400 0.01659711 +H 3.43093615 0.37452694 0.01884114 +H 4.67835240 2.53547842 0.01746817 +C -0.65930948 0.02468201 0.01780185 +H -0.04466478 -0.03344716 -0.85611628 +H -0.04386363 -0.03298673 0.89118649 +C -1.66236338 -1.14385651 0.01856968 +H -2.27713573 -1.08561745 0.89239069 +H -2.27768159 -1.08629703 -0.85491210 +H -1.12919956 -2.07156136 0.01876393 +O -3.62101473 7.52921876 0.01715974 +C -2.69982994 8.60858726 0.19402752 +H -2.03011871 8.64615667 -0.63962434 +H -2.14108178 8.45680900 1.09384076 +C -3.47584819 9.93535894 0.28927757 +H -4.05456450 10.07469158 -0.59986462 +H -4.12694690 9.90759901 1.13792346 +C -1.65137806 10.90285045 1.72438609 +H 
-2.24764703 10.40869908 2.46274761 +H -0.79110440 10.30633800 1.50302183 +H -1.33836538 11.85545774 2.09783276 +C -3.25771829 12.42866058 0.53449492 +H -2.56611180 13.24181825 0.60767325 +H -3.86037095 12.55070987 -0.34118410 +H -3.88574784 12.41553739 1.40069735 +N -2.48185199 11.10154878 0.44281205 diff --git a/gpu4pyscf/tests/095_Azadirachtin.xyz b/gpu4pyscf/tests/095_Azadirachtin.xyz new file mode 100644 index 00000000..8c03f7bb --- /dev/null +++ b/gpu4pyscf/tests/095_Azadirachtin.xyz @@ -0,0 +1,97 @@ +95 +Azadirachtin +C 0.24028400 -0.96854600 0.05735800 +C 1.49955800 -0.38999400 0.79976500 +C 1.84405900 1.11309900 0.52612700 +C 0.61115200 2.06994900 0.41027500 +C -0.38718900 1.44909800 -0.58288900 +C -0.81198100 0.11367700 0.01403200 +H 1.34464500 -0.48336800 1.89667000 +H 0.90815500 3.09474100 0.10955200 +H 0.07146500 1.40030200 -1.59457300 +H -1.08538000 0.33936800 1.09841400 +O -0.03234300 2.14051500 1.69756400 +H 0.43832200 2.76739400 2.27637900 +O -1.64345600 2.15598600 -0.77527600 +C -2.74935800 1.17918600 -0.75355500 +H -3.33770900 1.41858200 0.14457000 +H -3.31820200 1.39744800 -1.66649800 +C -2.11058900 -0.22990000 -0.71994400 +C 2.72998200 1.32748400 -0.70483200 +H 2.81316800 2.38444500 -0.97758400 +H 3.74960400 0.95856700 -0.53283000 +H 2.35200700 0.78104000 -1.58051000 +C 2.60140000 -1.34386400 0.30659000 +C 0.84678200 -1.40613600 -1.29617000 +H 0.88274800 -0.59319600 -2.03951200 +H 0.38815200 -2.30137400 -1.74034600 +O 2.22547600 -1.78168600 -1.02946800 +C -0.42290800 -2.19363100 0.75277400 +H -0.32012900 -3.08353500 0.10236100 +C -1.91400700 -2.00763500 1.11237500 +H -2.33420900 -2.99527800 1.38379200 +H -1.98093100 -1.38866600 2.03106200 +C -2.81353800 -1.37055100 0.02719800 +H -3.12020000 -2.14713900 -0.69849000 +C -1.82661295 -0.68751599 -2.16270012 +O -1.03585236 -0.24261727 -2.99355789 +O -2.59156054 -1.74766325 -2.52650357 +C -2.29916153 -2.14198817 -3.86960099 +H -2.96290254 -2.92828960 -4.16299137 +H -2.42740743 -1.30452633 -4.52313804 
+H -1.28838658 -2.48820275 -3.92764814 +O -4.01986539 -0.90962471 0.64138134 +C -4.89301012 -1.93494775 0.80793745 +O -4.54153100 -3.05110585 0.42818050 +C -6.20834727 -1.48087047 1.46771166 +H -6.70958996 -0.78829922 0.82428269 +H -6.83594045 -2.33131805 1.63434212 +H -5.99341406 -1.00749899 2.40292455 +O 0.29104226 -2.52037085 1.94793763 +C 0.31248536 -3.86361432 2.13937213 +O -0.25336168 -4.56806573 1.30443072 +C 1.07328546 -4.25938123 3.41849362 +C 1.18469713 -5.56341278 3.77014145 +H 0.75137836 -6.32562659 3.15681858 +C 1.70966559 -3.16955354 4.30104443 +H 2.52793619 -2.72004059 3.77829081 +H 0.97813456 -2.42251044 4.52839648 +H 2.06508607 -3.60889199 5.20964665 +C 1.93754031 -5.94957419 5.05688405 +H 1.46239499 -5.49165555 5.89917107 +H 1.92238977 -7.01309886 5.17344190 +H 2.95091533 -5.61227499 4.99207421 +C 3.99823568 -0.71610148 0.14421916 +O 4.54063921 0.18499764 0.78248292 +O 4.69984280 -1.27738694 -0.87269582 +O 2.69271189 -2.53050618 1.09933364 +H 3.60067733 -2.84219679 1.10624230 +C 5.98847134 -0.66885730 -0.99113633 +H 6.49970371 -0.73075570 -0.05320774 +H 6.55618159 -1.17887968 -1.74112449 +H 5.87374685 0.35839671 -1.26770006 +C 2.63486992 1.58151749 1.76176538 +C 2.13434327 2.21842175 3.11643757 +C 3.90461234 2.45387090 1.74128354 +O 2.44467967 0.78466796 2.96396625 +C 3.35337126 2.98709450 3.79243900 +C 0.74513758 2.60743687 3.44136489 +O 5.00327683 3.19196370 1.11718214 +C 4.47769203 2.16352749 3.16877423 +H 3.15573566 3.35599353 1.51547111 +C 3.84794511 4.41584726 3.25643717 +H 3.24116904 2.99889070 4.88162906 +H 0.00697023 1.93995296 2.97068106 +H 0.55491721 2.57388288 4.52449549 +H 0.54134467 3.63458255 3.09753074 +C 4.84981258 4.42246076 1.92099071 +H 4.49637929 1.09030804 3.43212004 +H 5.51163803 2.50195502 3.32489990 +C 4.76579887 5.04464694 4.26535741 +O 2.75093022 5.20578033 2.83652107 +H 4.60685318 5.22136931 1.20459035 +O 6.17282363 4.70855901 2.47193815 +H 4.42807865 5.31783232 5.24674785 +C 6.01838353 5.12565144 3.78006571 +H 
2.50011685 5.87405238 3.50751412 +H 6.95619123 5.44887224 4.20308201 diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json new file mode 100644 index 00000000..1c5a9fc2 --- /dev/null +++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_rks_1v100.json @@ -0,0 +1,873 @@ +{ + "machine_info": { + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker", + "processor": "", + "machine": "x86_64", + "python_compiler": "GCC 10.2.1 20210110", + "python_implementation": "CPython", + "python_implementation_version": "3.9.2", + "python_version": "3.9.2", + "python_build": [ + "default", + "Feb 28 2021 17:03:44" + ], + "release": "5.4.143.bsk.7-amd64", + "system": "Linux", + "cpu": { + "python_version": "3.9.2.final.0 (64 bit)", + "cpuinfo_version": [ + 9, + 0, + 0 + ], + "cpuinfo_version_string": "9.0.0", + "arch": "X86_64", + "bits": 64, + "count": 96, + "arch_string_raw": "x86_64", + "vendor_id_raw": "GenuineIntel", + "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz", + "hz_advertised_friendly": "2.4000 GHz", + "hz_actual_friendly": "3.1000 GHz", + "hz_advertised": [ + 2400000000, + 0 + ], + "hz_actual": [ + 3100005000, + 0 + ], + "stepping": 7, + "model": 85, + "family": 6, + "flags": [ + "3dnowprefetch", + "abm", + "acpi", + "adx", + "aes", + "aperfmperf", + "apic", + "arat", + "arch_capabilities", + "arch_perfmon", + "art", + "avx", + "avx2", + "avx512_vnni", + "avx512bw", + "avx512cd", + "avx512dq", + "avx512f", + "avx512vl", + "avx512vnni", + "bmi1", + "bmi2", + "bts", + "cat_l3", + "cdp_l3", + "clflush", + "clflushopt", + "clwb", + "cmov", + "constant_tsc", + "cpuid", + "cpuid_fault", + "cqm", + "cqm_llc", + "cqm_mbm_local", + "cqm_mbm_total", + "cqm_occup_llc", + "cx16", + "cx8", + "dca", + "de", + "ds_cpl", + "dtes64", + "dtherm", + "dts", + "epb", + "ept", + "ept_ad", + "erms", + "est", + "f16c", + "flexpriority", + "flush_l1d", + "fma", + "fpu", + "fsgsbase", + "fxsr", + "ht", 
+ "hwp", + "hwp_act_window", + "hwp_epp", + "hwp_pkg_req", + "ibpb", + "ibrs", + "ibrs_enhanced", + "ida", + "intel_ppin", + "intel_pt", + "invpcid", + "invpcid_single", + "lahf_lm", + "lm", + "mba", + "mca", + "mce", + "md_clear", + "mmx", + "movbe", + "mpx", + "msr", + "mtrr", + "nonstop_tsc", + "nopl", + "nx", + "ospke", + "osxsave", + "pae", + "pat", + "pbe", + "pcid", + "pclmulqdq", + "pdcm", + "pdpe1gb", + "pebs", + "pge", + "pku", + "pln", + "pni", + "popcnt", + "pqe", + "pqm", + "pse", + "pse36", + "pts", + "rdrand", + "rdrnd", + "rdseed", + "rdt_a", + "rdtscp", + "rep_good", + "sdbg", + "sep", + "smap", + "smep", + "smx", + "ss", + "ssbd", + "sse", + "sse2", + "sse4_1", + "sse4_2", + "ssse3", + "stibp", + "syscall", + "tm", + "tm2", + "tpr_shadow", + "tsc", + "tsc_adjust", + "tsc_deadline_timer", + "tscdeadline", + "vme", + "vmx", + "vnmi", + "vpid", + "x2apic", + "xgetbv1", + "xsave", + "xsavec", + "xsaveopt", + "xsaves", + "xtopology", + "xtpr" + ], + "l3_cache_size": 37486592, + "l2_cache_size": 50331648, + "l1_data_cache_size": "1.5 MiB", + "l1_instruction_cache_size": "1.5 MiB", + "l2_cache_line_size": 256, + "l2_cache_associativity": 6 + } + }, + "commit_info": { + "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13", + "time": "2025-01-05T23:21:10+00:00", + "author_time": "2025-01-05T23:21:10+00:00", + "dirty": false, + "project": "gpu4pyscf", + "branch": "benchmark_ci" + }, + "benchmarks": [ + { + "group": null, + "name": "test_df_rb3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 2.725358221679926, + "max": 2.835785958915949, + "mean": 2.782431565846006, + "stddev": 0.055307723110869685, + "rounds": 3, + "median": 2.7861505169421434, + "iqr": 0.08282080292701721, + "q1": 2.7405562954954803, + "q3": 
2.8233770984224975, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.725358221679926, + "hd15iqr": 2.835785958915949, + "ops": 0.35939787783997024, + "total": 8.347294697538018, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 4.394210334867239, + "max": 4.473813105374575, + "mean": 4.42994485112528, + "stddev": 0.04041990275091787, + "rounds": 3, + "median": 4.4218111131340265, + "iqr": 0.05970207788050175, + "q1": 4.401110529433936, + "q3": 4.460812607314438, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 4.394210334867239, + "hd15iqr": 4.473813105374575, + "ops": 0.22573644449455918, + "total": 13.28983455337584, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 43.774112831801176, + "max": 43.774112831801176, + "mean": 43.774112831801176, + "stddev": 0, + "rounds": 1, + "median": 43.774112831801176, + "iqr": 0.0, + "q1": 43.774112831801176, + "q3": 43.774112831801176, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 43.774112831801176, + "hd15iqr": 43.774112831801176, + "ops": 0.022844552072189946, + "total": 43.774112831801176, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp", + "params": null, + "param": null, + "extra_info": 
{}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 40.097773076966405, + "max": 40.15744375810027, + "mean": 40.11991243995726, + "stddev": 0.03267769513443882, + "rounds": 3, + "median": 40.10452048480511, + "iqr": 0.04475301085039973, + "q1": 40.09945992892608, + "q3": 40.14421293977648, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 40.097773076966405, + "hd15iqr": 40.15744375810027, + "ops": 0.024925278725285903, + "total": 120.35973731987178, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 48.99313645064831, + "max": 49.26371451281011, + "mean": 49.142610578487314, + "stddev": 0.13750190122656403, + "rounds": 3, + "median": 49.17098077200353, + "iqr": 0.20293354662135243, + "q1": 49.037597530987114, + "q3": 49.240531077608466, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 48.99313645064831, + "hd15iqr": 49.26371451281011, + "ops": 0.02034893930599935, + "total": 147.42783173546195, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 615.0911720395088, + "max": 615.0911720395088, + "mean": 615.0911720395088, + "stddev": 0, + "rounds": 1, + "median": 615.0911720395088, + "iqr": 0.0, + "q1": 615.0911720395088, + "q3": 615.0911720395088, + 
"iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 615.0911720395088, + "hd15iqr": 615.0911720395088, + "ops": 0.0016257752435044988, + "total": 615.0911720395088, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 18.244548039510846, + "max": 18.375720830634236, + "mean": 18.312131161491077, + "stddev": 0.06567751542153955, + "rounds": 3, + "median": 18.316124614328146, + "iqr": 0.09837959334254265, + "q1": 18.26244218321517, + "q3": 18.360821776557714, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 18.244548039510846, + "hd15iqr": 18.375720830634236, + "ops": 0.05460860842362896, + "total": 54.93639348447323, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_grad_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_grad_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 30.697130125015974, + "max": 30.711910048499703, + "mean": 30.70534764789045, + "stddev": 0.00752768934207856, + "rounds": 3, + "median": 30.70700277015567, + "iqr": 0.011084942612797022, + "q1": 30.699598286300898, + "q3": 30.710683228913695, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 30.697130125015974, + "hd15iqr": 30.711910048499703, + "ops": 0.03256761693329022, + "total": 92.11604294367135, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_hessian_medium", + "fullname": 
"gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_hessian_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 667.9882875829935, + "max": 667.9882875829935, + "mean": 667.9882875829935, + "stddev": 0, + "rounds": 1, + "median": 667.9882875829935, + "iqr": 0.0, + "q1": 667.9882875829935, + "q3": 667.9882875829935, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 667.9882875829935, + "hd15iqr": 667.9882875829935, + "ops": 0.0014970322363260838, + "total": 667.9882875829935, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 460.72668202780187, + "max": 461.77398146130145, + "mean": 461.4145879279822, + "stddev": 0.5959440470695604, + "rounds": 3, + "median": 461.7431002948433, + "iqr": 0.785474575124681, + "q1": 460.98078659456223, + "q3": 461.7662611696869, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 460.72668202780187, + "hd15iqr": 461.77398146130145, + "ops": 0.0021672483405662944, + "total": 1384.2437637839466, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_grad_medium", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_grad_medium", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 552.0836905632168, + "max": 553.4436832498759, + "mean": 552.8364644367248, + "stddev": 0.6915813282891417, + 
"rounds": 3, + "median": 552.9820194970816, + "iqr": 1.0199945149943233, + "q1": 552.308272796683, + "q3": 553.3282673116773, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 552.0836905632168, + "hd15iqr": 553.4436832498759, + "ops": 0.0018088531859396832, + "total": 1658.5093933101743, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 1.6017291732132435, + "max": 1.647629827260971, + "mean": 1.6208390643199284, + "stddev": 0.02389486042236203, + "rounds": 3, + "median": 1.613158192485571, + "iqr": 0.03442549053579569, + "q1": 1.6045864280313253, + "q3": 1.639011918567121, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 1.6017291732132435, + "hd15iqr": 1.647629827260971, + "ops": 0.6169643994973554, + "total": 4.8625171929597855, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 2.1184212770313025, + "max": 2.20925628952682, + "mean": 2.15202548665305, + "stddev": 0.04981377124137081, + "rounds": 3, + "median": 2.1283988934010267, + "iqr": 0.0681262593716383, + "q1": 2.1209156811237335, + "q3": 2.189041940495372, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.1184212770313025, + "hd15iqr": 2.20925628952682, + "ops": 0.46467851157063006, + "total": 6.456076459959149, + "iterations": 1 + } + }, + { + "group": null, + 
"name": "test_df_rb3lyp_631gs_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 16.1142161693424, + "max": 16.1142161693424, + "mean": 16.1142161693424, + "stddev": 0, + "rounds": 1, + "median": 16.1142161693424, + "iqr": 0.0, + "q1": 16.1142161693424, + "q3": 16.1142161693424, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 16.1142161693424, + "hd15iqr": 16.1142161693424, + "ops": 0.06205700541007504, + "total": 16.1142161693424, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_631gs_large", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_large", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 55.4929311927408, + "max": 56.77203128859401, + "mean": 56.066467080265284, + "stddev": 0.6496905970719544, + "rounds": 3, + "median": 55.934438759461045, + "iqr": 0.9593250718899071, + "q1": 55.60330808442086, + "q3": 56.56263315631077, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 55.4929311927408, + "hd15iqr": 56.77203128859401, + "ops": 0.01783597312397784, + "total": 168.19940124079585, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_rb3lyp_631gs_grad_large", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_rb3lyp_631gs_grad_large", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 70.14288471080363, + "max": 70.61111964285374, + "mean": 
70.3403081515183, + "stddev": 0.24259089508559126, + "rounds": 3, + "median": 70.26692010089755, + "iqr": 0.3511761990375817, + "q1": 70.17389355832711, + "q3": 70.52506975736469, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 70.14288471080363, + "hd15iqr": 70.61111964285374, + "ops": 0.014216599646477592, + "total": 211.02092445455492, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 2.51676319912076, + "max": 2.569052016362548, + "mean": 2.540054644147555, + "stddev": 0.02660729798277223, + "rounds": 3, + "median": 2.5343487169593573, + "iqr": 0.03921661293134093, + "q1": 2.5211595785804093, + "q3": 2.56037619151175, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 2.51676319912076, + "hd15iqr": 2.569052016362548, + "ops": 0.393692317723976, + "total": 7.620163932442665, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 3, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 3.7774324007332325, + "max": 3.8614633549004793, + "mean": 3.8227184594919286, + "stddev": 0.04239564161614309, + "rounds": 3, + "median": 3.8292596228420734, + "iqr": 0.06302321562543511, + "q1": 3.7903892062604427, + "q3": 3.853412421885878, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 3.7774324007332325, + "hd15iqr": 3.8614633549004793, + "ops": 
0.26159394436097405, + "total": 11.468155378475785, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_rb3lyp_631gs_solvent_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_rks.py::test_df_rb3lyp_631gs_solvent_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 122.75680537335575, + "max": 122.75680537335575, + "mean": 122.75680537335575, + "stddev": 0, + "rounds": 1, + "median": 122.75680537335575, + "iqr": 0.0, + "q1": 122.75680537335575, + "q3": 122.75680537335575, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 122.75680537335575, + "hd15iqr": 122.75680537335575, + "ops": 0.00814618787902287, + "total": 122.75680537335575, + "iterations": 1 + } + } + ], + "datetime": "2025-01-06T03:31:22.391433+00:00", + "version": "5.1.0" +} \ No newline at end of file diff --git a/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json new file mode 100644 index 00000000..7bfabd8a --- /dev/null +++ b/gpu4pyscf/tests/benchmark_results/v1.3.0_uks_1v100.json @@ -0,0 +1,418 @@ +{ + "machine_info": { + "node": "mlxlabzskrh20v669fd914-20240723162348-uim1c3-7vr5b9-worker", + "processor": "", + "machine": "x86_64", + "python_compiler": "GCC 10.2.1 20210110", + "python_implementation": "CPython", + "python_implementation_version": "3.9.2", + "python_version": "3.9.2", + "python_build": [ + "default", + "Feb 28 2021 17:03:44" + ], + "release": "5.4.143.bsk.7-amd64", + "system": "Linux", + "cpu": { + "python_version": "3.9.2.final.0 (64 bit)", + "cpuinfo_version": [ + 9, + 0, + 0 + ], + "cpuinfo_version_string": "9.0.0", + "arch": "X86_64", + "bits": 64, + "count": 96, + "arch_string_raw": "x86_64", + "vendor_id_raw": "GenuineIntel", + "brand_raw": "Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz", + 
"hz_advertised_friendly": "2.4000 GHz", + "hz_actual_friendly": "3.1000 GHz", + "hz_advertised": [ + 2400000000, + 0 + ], + "hz_actual": [ + 3100012000, + 0 + ], + "stepping": 7, + "model": 85, + "family": 6, + "flags": [ + "3dnowprefetch", + "abm", + "acpi", + "adx", + "aes", + "aperfmperf", + "apic", + "arat", + "arch_capabilities", + "arch_perfmon", + "art", + "avx", + "avx2", + "avx512_vnni", + "avx512bw", + "avx512cd", + "avx512dq", + "avx512f", + "avx512vl", + "avx512vnni", + "bmi1", + "bmi2", + "bts", + "cat_l3", + "cdp_l3", + "clflush", + "clflushopt", + "clwb", + "cmov", + "constant_tsc", + "cpuid", + "cpuid_fault", + "cqm", + "cqm_llc", + "cqm_mbm_local", + "cqm_mbm_total", + "cqm_occup_llc", + "cx16", + "cx8", + "dca", + "de", + "ds_cpl", + "dtes64", + "dtherm", + "dts", + "epb", + "ept", + "ept_ad", + "erms", + "est", + "f16c", + "flexpriority", + "flush_l1d", + "fma", + "fpu", + "fsgsbase", + "fxsr", + "ht", + "hwp", + "hwp_act_window", + "hwp_epp", + "hwp_pkg_req", + "ibpb", + "ibrs", + "ibrs_enhanced", + "ida", + "intel_ppin", + "intel_pt", + "invpcid", + "invpcid_single", + "lahf_lm", + "lm", + "mba", + "mca", + "mce", + "md_clear", + "mmx", + "movbe", + "mpx", + "msr", + "mtrr", + "nonstop_tsc", + "nopl", + "nx", + "ospke", + "osxsave", + "pae", + "pat", + "pbe", + "pcid", + "pclmulqdq", + "pdcm", + "pdpe1gb", + "pebs", + "pge", + "pku", + "pln", + "pni", + "popcnt", + "pqe", + "pqm", + "pse", + "pse36", + "pts", + "rdrand", + "rdrnd", + "rdseed", + "rdt_a", + "rdtscp", + "rep_good", + "sdbg", + "sep", + "smap", + "smep", + "smx", + "ss", + "ssbd", + "sse", + "sse2", + "sse4_1", + "sse4_2", + "ssse3", + "stibp", + "syscall", + "tm", + "tm2", + "tpr_shadow", + "tsc", + "tsc_adjust", + "tsc_deadline_timer", + "tscdeadline", + "vme", + "vmx", + "vnmi", + "vpid", + "x2apic", + "xgetbv1", + "xsave", + "xsavec", + "xsaveopt", + "xsaves", + "xtopology", + "xtpr" + ], + "l3_cache_size": 37486592, + "l2_cache_size": 50331648, + "l1_data_cache_size": "1.5 
MiB", + "l1_instruction_cache_size": "1.5 MiB", + "l2_cache_line_size": 256, + "l2_cache_associativity": 6 + } + }, + "commit_info": { + "id": "1ed8e5eccf5aaae256f4ad591db81f6689dd8a13", + "time": "2025-01-05T23:21:10+00:00", + "author_time": "2025-01-05T23:21:10+00:00", + "dirty": false, + "project": "gpu4pyscf", + "branch": "benchmark_ci" + }, + "benchmarks": [ + { + "group": null, + "name": "test_df_ub3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 6.552961312234402, + "max": 6.817228589206934, + "mean": 6.699132799791793, + "stddev": 0.10053109169956066, + "rounds": 6, + "median": 6.730765865184367, + "iqr": 0.15081804990768433, + "q1": 6.606128558516502, + "q3": 6.756946608424187, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 6.552961312234402, + "hd15iqr": 6.817228589206934, + "ops": 0.14927305218237794, + "total": 40.19479679875076, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_df_ub3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 13.294025084003806, + "max": 14.571726197376847, + "mean": 13.735415458368758, + "stddev": 0.5932420341119666, + "rounds": 6, + "median": 13.415598810650408, + "iqr": 1.1223390139639378, + "q1": 13.296602416783571, + "q3": 14.418941430747509, + "iqr_outliers": 0, + "stddev_outliers": 2, + "outliers": "2;0", + "ld15iqr": 13.294025084003806, + "hd15iqr": 14.571726197376847, + "ops": 0.07280449601476865, + "total": 82.41249275021255, + "iterations": 1 + } + }, + { + "group": null, + 
"name": "test_df_ub3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_df_ub3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 93.588756557554, + "max": 93.588756557554, + "mean": 93.588756557554, + "stddev": 0, + "rounds": 1, + "median": 93.588756557554, + "iqr": 0.0, + "q1": 93.588756557554, + "q3": 93.588756557554, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 93.588756557554, + "hd15iqr": 93.588756557554, + "ops": 0.01068504419529319, + "total": 93.588756557554, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 6.713842295110226, + "max": 7.0260709673166275, + "mean": 6.852823034239312, + "stddev": 0.11983568503202911, + "rounds": 6, + "median": 6.869919722899795, + "iqr": 0.19665820337831974, + "q1": 6.720263646915555, + "q3": 6.916921850293875, + "iqr_outliers": 0, + "stddev_outliers": 3, + "outliers": "3;0", + "ld15iqr": 6.713842295110226, + "hd15iqr": 7.0260709673166275, + "ops": 0.14592526247994722, + "total": 41.11693820543587, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp_grad", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_grad", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 6, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": 2 + }, + "stats": { + "min": 7.483015248551965, + "max": 7.855705849826336, + "mean": 7.595327176774542, + "stddev": 0.14647552264068445, + "rounds": 
6, + "median": 7.529973562806845, + "iqr": 0.19051661528646946, + "q1": 7.491389110684395, + "q3": 7.681905725970864, + "iqr_outliers": 0, + "stddev_outliers": 1, + "outliers": "1;0", + "ld15iqr": 7.483015248551965, + "hd15iqr": 7.855705849826336, + "ops": 0.13165989781952533, + "total": 45.57196306064725, + "iterations": 1 + } + }, + { + "group": null, + "name": "test_ub3lyp_hessian", + "fullname": "gpu4pyscf/tests/test_benchmark_uks.py::test_ub3lyp_hessian", + "params": null, + "param": null, + "extra_info": {}, + "options": { + "disable_gc": false, + "timer": "perf_counter", + "min_rounds": 1, + "max_time": 1.0, + "min_time": 5e-06, + "warmup": false + }, + "stats": { + "min": 61.551909405738115, + "max": 61.551909405738115, + "mean": 61.551909405738115, + "stddev": 0, + "rounds": 1, + "median": 61.551909405738115, + "iqr": 0.0, + "q1": 61.551909405738115, + "q3": 61.551909405738115, + "iqr_outliers": 0, + "stddev_outliers": 0, + "outliers": "0;0", + "ld15iqr": 61.551909405738115, + "hd15iqr": 61.551909405738115, + "ops": 0.016246449698387032, + "total": 61.551909405738115, + "iterations": 1 + } + } + ], + "datetime": "2025-01-06T03:46:22.404689+00:00", + "version": "5.1.0" +} \ No newline at end of file diff --git a/gpu4pyscf/tests/test_benchmark_rks.py b/gpu4pyscf/tests/test_benchmark_rks.py new file mode 100644 index 00000000..c367ac90 --- /dev/null +++ b/gpu4pyscf/tests/test_benchmark_rks.py @@ -0,0 +1,289 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import pyscf +import pytest +from gpu4pyscf.dft import rks +CUDA_VISIBLE_DEVICES=0 +# Any task taking more than 1000s will be marked as 'slow' + +# How to run +# 1. run test only +# pytest test_benchmark_rks.py --benchmark-disable -s -v -m "not slow" --durations=20 + +# 2. benchmark less expensive tasks +# pytest test_benchmark_rks.py -v -m "not slow" + +# 3. benchmark all the tests +# pytest test_benchmark_rks.py -v + +# 4. save benchmark results +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-save=v1.3.0_rks_1v100 + +# 5. compare benchmark results, fail if performance regresses by more than 10% +# pytest test_benchmark_rks.py -s -v -m "not slow and not high_memory" --benchmark-compare-fail=min:10% --benchmark-compare=1v100 --benchmark-storage=benchmark_results/ + +current_folder = os.path.dirname(os.path.abspath(__file__)) +small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') +medium_mol = os.path.join(current_folder, '057_Tamoxifen.xyz') +large_mol = os.path.join(current_folder, '095_Azadirachtin.xyz') + +def run_rb3lyp(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + return mf.kernel() + +def run_rb3lyp_grad(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.kernel() + g = mf.nuc_grad_method().kernel() + return g + 
+def run_rb3lyp_hessian(atom, basis, with_df, with_solvent, disp=None): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = rks.RKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + if disp is not None: + mf.disp = disp + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-6 + mf.kernel() + hobj = mf.Hessian() + if with_df: + hobj.auxbasis_response = 2 + h = hobj.kernel() + return h + +####### +# DF +####### +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp(benchmark): + e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp') + assert np.isclose(np.linalg.norm(e), 684.9998712035579, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17435941081837686, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_rb3lyp_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.7587394873290885, atol=1e-4, rtol=1e-16) + +################ +# Direct SCF +################ +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp(benchmark): + e = benchmark(run_rb3lyp, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp') + assert np.isclose(np.linalg.norm(e), 684.999735850967, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.1744127474130983, atol=1e-5, 
rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_rb3lyp_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.7588443634477833, atol=1e-4, rtol=1e-16) + +#################### +# Medium molecule +#################### +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_medium(benchmark): + e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp medium') + assert np.isclose(np.linalg.norm(e), 1138.371390377773, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_grad_medium(benchmark): + g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad medium') + assert np.isclose(np.linalg.norm(g), 0.26010545073602614, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_rb3lyp_hessian_medium(benchmark): + h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian medium') + assert np.isclose(np.linalg.norm(h), 6.31265424196621, atol=1e-4, rtol=1e-16) + +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_rb3lyp_medium(benchmark): + e = benchmark(run_rb3lyp, medium_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp medium') + assert np.isclose(np.linalg.norm(e), 1138.3710752128077, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_rb3lyp_grad_medium(benchmark): + g = benchmark(run_rb3lyp_grad, medium_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad medium') + assert np.isclose(np.linalg.norm(g), 0.2601443836937988, atol=1e-5, rtol=1e-16) +@pytest.mark.slow +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_rb3lyp_hessian_medium(benchmark): + h = benchmark(run_rb3lyp_hessian, medium_mol, 'def2-tzvpp', 
False, False) + print('testing rb3lyp hessian medium') + assert np.isclose(np.linalg.norm(h), 6.312714778020796, atol=1e-4, rtol=1e-16) + +#################### +# large molecule +#################### +@pytest.mark.high_memory +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp large') + assert np.isclose(np.linalg.norm(e), 2564.198712152175, atol=1e-7, rtol=1e-16) +@pytest.mark.high_memory +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp grad large') + assert np.isclose(np.linalg.norm(g), 0.3784358687859323, atol=1e-5, rtol=1e-16) +@pytest.mark.high_memory +@pytest.mark.slow +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_rb3lyp_hessian_large(benchmark): + h = benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', True, False) + print('testing df rb3lyp hessian large') + assert np.isclose(np.linalg.norm(h), 7.583208736873523, atol=1e-4, rtol=1e-16) +@pytest.mark.slow +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp large') + assert np.isclose(np.linalg.norm(e), 2564.198099576358, atol=1e-7, rtol=1e-16) +@pytest.mark.slow +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp grad large') + assert np.isclose(np.linalg.norm(g), 0.3784664384209763, atol=1e-5, rtol=1e-16) + +# Hessian for large molecule with large basis set is too slow +''' +@pytest.mark.slow +@pytest.mark.benchmark +def test_rb3lyp_hessian_large(benchmark): + h = 
benchmark(run_rb3lyp_hessian, large_mol, 'def2-tzvpp', False, False) + print('testing rb3lyp hessian large') + print(np.linalg.norm(h)) +''' + +##################### +# Small basis set +##################### +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_631gs(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs') + assert np.isclose(np.linalg.norm(e), 684.6646008642876, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_631gs_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs grad') + assert np.isclose(np.linalg.norm(g), 0.17530687343398219, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_rb3lyp_631gs_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, False) + print('testing df rb3lyp 631gs hessian') + assert np.isclose(np.linalg.norm(h), 3.9071846157996553, atol=1e-4, rtol=1e-16) + +######################################### +# Small basis set for large molecule +######################################### +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp_631gs_large(benchmark): + e = benchmark(run_rb3lyp, large_mol, '6-31gs', False, False) + print('testing rb3lyp 631gs large') + assert np.isclose(np.linalg.norm(e), 2563.1171191823423, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_rb3lyp_631gs_grad_large(benchmark): + g = benchmark(run_rb3lyp_grad, large_mol, '6-31gs', False, False) + print('testing df rb3lyp 631gs grad large') + assert np.isclose(np.linalg.norm(g), 0.37778228700247984, atol=1e-5, rtol=1e-16) +@pytest.mark.slow +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_rb3lyp_631gs_hessian_large(benchmark): + h = benchmark(run_rb3lyp_hessian, large_mol, 
'6-31gs', False, False) + print('testing df rb3lyp 631gs hessian large') + assert np.isclose(np.linalg.norm(h), 7.920764634100053, atol=1e-4, rtol=1e-16) + +################### +# Solvent model +################### +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_631gs_solvent(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent') + assert np.isclose(np.linalg.norm(e), 684.6985561053816, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_rb3lyp_631gs_solvent_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent grad') + assert np.isclose(np.linalg.norm(g), 0.16956999476137297, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_rb3lyp_631gs_solvent_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True) + print('testing df rb3lyp 631gs solvent hessian') + assert np.isclose(np.linalg.norm(h), 3.8991230592666737, atol=1e-4, rtol=1e-16) + +# No need to test d3bj generally +''' +# b3lyp d3bj +@pytest.mark.benchmark +def test_df_rb3lyp_631gs_d3bj(benchmark): + e = benchmark(run_rb3lyp, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent') + assert np.isclose(np.linalg.norm(e), 684.7313814096565, atol=1e-7) +@pytest.mark.benchmark +def test_df_rb3lyp_631gs_d3bj_grad(benchmark): + g = benchmark(run_rb3lyp_grad, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent grad') + assert np.isclose(np.linalg.norm(g), 0.17010044498887264, atol=1e-5) +@pytest.mark.benchmark +def test_df_rb3lyp_631gs_d3bj_hessian(benchmark): + h = benchmark(run_rb3lyp_hessian, small_mol, '6-31gs', True, True, 'd3bj') + print('testing df rb3lyp 631gs solvent hessian') + assert np.isclose(np.linalg.norm(h), 3.902367554157861, atol=1e-4) +''' diff 
--git a/gpu4pyscf/tests/test_benchmark_uks.py b/gpu4pyscf/tests/test_benchmark_uks.py new file mode 100644 index 00000000..236a433b --- /dev/null +++ b/gpu4pyscf/tests/test_benchmark_uks.py @@ -0,0 +1,100 @@ +# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import pyscf +import pytest +from gpu4pyscf.dft import uks + +current_folder = os.path.dirname(os.path.abspath(__file__)) +small_mol = os.path.join(current_folder, '020_Vitamin_C.xyz') + +def run_ub3lyp(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + return mf.kernel() + +def run_ub3lyp_grad(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.kernel() + g = mf.nuc_grad_method().kernel() + return g + +def run_ub3lyp_hessian(atom, basis, with_df, with_solvent): + mol = pyscf.M(atom=atom, basis=basis, verbose=0) + mf = uks.UKS(mol, xc='b3lyp') + if with_df: + mf = mf.density_fit() + if with_solvent: + mf = mf.PCM() + mf.with_solvent.method = 'IEF-PCM' + 
mf.grids.atom_grid = (99,590) + mf.conv_tol = 1e-10 + mf.conv_tol_cpscf = 1e-6 + mf.kernel() + hobj = mf.Hessian() + if with_df: + hobj.auxbasis_response = 2 + h = hobj.kernel() + return h + +########## +# UKS +########## +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_ub3lyp(benchmark): + e = benchmark(run_ub3lyp, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp') + assert np.isclose(np.linalg.norm(e), 684.9998712035856, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_df_ub3lyp_grad(benchmark): + g = benchmark(run_ub3lyp_grad, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17435842214665462, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_df_ub3lyp_hessian(benchmark): + h = benchmark(run_ub3lyp_hessian, small_mol, 'def2-tzvpp', True, False) + print('testing df ub3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.758810345806532, atol=1e-4, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_ub3lyp(benchmark): + e = benchmark(run_ub3lyp, small_mol, '6-31gs', False, False) + print('testing ub3lyp') + assert np.isclose(np.linalg.norm(e), 684.6643858622429, atol=1e-7, rtol=1e-16) +@pytest.mark.benchmark(warmup=True, warmup_iterations=2, min_rounds=3) +def test_ub3lyp_grad(benchmark): + g = benchmark(run_ub3lyp_grad, small_mol, '6-31gs', False, False) + print('testing ub3lyp grad') + assert np.isclose(np.linalg.norm(g), 0.17540045665419984, atol=1e-5, rtol=1e-16) +@pytest.mark.benchmark(warmup=False, min_rounds=1) +def test_ub3lyp_hessian(benchmark): + h = benchmark(run_ub3lyp_hessian, small_mol, '6-31gs', False, False) + print('testing ub3lyp hessian') + assert np.isclose(np.linalg.norm(h), 3.907289414559395, atol=1e-4, rtol=1e-16) diff --git a/gpu4pyscf/tests/test_dft.py b/gpu4pyscf/tests/test_dft.py deleted 
file mode 100644 index d6f09839..00000000 --- a/gpu4pyscf/tests/test_dft.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2021-2024 The PySCF Developers. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -import pyscf -import pytest -import cupy -from gpu4pyscf.dft import rks, uks - -def setUpModule(): - global mol - atom = ''' -C -0.07551087 1.68127663 -0.10745193 -O 1.33621755 1.87147409 -0.39326987 -C 1.67074668 2.95729545 0.49387976 -C 0.41740763 3.77281969 0.78495878 -C -0.60481480 3.07572636 0.28906224 -H -0.19316298 1.01922455 0.72486113 -O 0.35092043 5.03413298 1.45545728 -H 0.42961487 5.74279041 0.81264173 -O -1.95331750 3.53349874 0.15912025 -H -2.55333895 2.78846397 0.23972698 -O 2.81976302 3.20110148 0.94542226 -C -0.81772499 1.09230218 -1.32146482 -H -0.70955636 1.74951833 -2.15888136 -C -2.31163857 0.93420736 -0.98260166 -H -2.72575463 1.89080093 -0.74107186 -H -2.41980721 0.27699120 -0.14518512 -O -0.26428017 -0.18613595 -1.64425697 -H -0.72695910 -0.55328886 -2.40104423 -O -3.00083741 0.38730252 -2.10989934 -H -3.93210821 0.28874990 -1.89865997 -''' - - mol = pyscf.M(atom=atom, basis='def2-tzvpp', max_memory=32000, cart=0) - mol.output = '/dev/null' - mol.build() - mol.verbose = 1 - -def tearDownModule(): - global mol - mol.stdout.close() - del mol - -class KnownValues(unittest.TestCase): - @pytest.mark.smoke - def test_b3lyp_with_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = 
rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_b3lyp_d3bj(self): - print('-------- DFRKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp-d3bj').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965348272) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498362161082373) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7684319231335377) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS(self): - print('------- DFUKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0326965349493) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.17498264516108836) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.768429871470736) < 1e-4 - - @pytest.mark.smoke - def test_RKS(self): - print('-------- RKS with D3(BJ) -------') - mf = rks.RKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - 
@pytest.mark.smoke - def test_UKS(self): - print('-------- UKS with D3(BJ) -------') - mf = uks.UKS(mol, xc='b3lyp') - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-12 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0325611822375) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1750368231223345) < 1e-6 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.760381249106394) < 1e-4 - - @pytest.mark.smoke - def test_DFRKS_with_SMD(self): - print('----- DFRKS with SMD -----') - mf = rks.RKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.0578838805443) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.16804945458657145) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.741783814494321) < 1e-4 - - @pytest.mark.smoke - def test_DFUKS_with_SMD(self): - print('------- DFUKS with SMD ---------') - mf = uks.UKS(mol, xc='b3lyp').density_fit(auxbasis='def2-tzvpp-jkfit') - mf = mf.SMD() - mf.grids.atom_grid = (99,590) - mf.conv_tol = 1e-10 - mf.conv_tol_cpscf = 1e-8 - mf.disp = 'd3bj' - e_dft = mf.kernel() - assert np.abs(e_dft - -685.05788388063) < 1e-7 - - g = mf.nuc_grad_method().kernel() - assert np.abs(cupy.linalg.norm(g) - 0.1680496465773684) < 1e-5 - - h = mf.Hessian().kernel() - assert np.abs(cupy.linalg.norm(h) - 3.7417788481647563) < 1e-4 - -if __name__ == "__main__": - print("Full Smoke Tests") - unittest.main() diff --git a/setup.py b/setup.py index edbe56c1..c0aa6f5c 100755 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def initialize_with_default_plat_name(self): ], cmdclass={'build_py': CMakeBuildPy}, install_requires=[ - 'pyscf~=2.7.0', + 'pyscf~=2.8.0', 'pyscf-dispersion', f'cupy-cuda{CUDA_VERSION}>=13.0', # Due to expm 
in cupyx.scipy.linalg and cutensor 2.0 'geometric',