Skip to content

Commit

Permalink
to_gpu and group grids (#69)
Browse files Browse the repository at this point in the history
* fixed a bug in screen_index

* added unit test for to_gpu

* new grids group scheme

* use grid_aligned in gpu4pyscf.__config__
  • Loading branch information
wxj6000 authored Dec 27, 2023
1 parent 43f21be commit 2e5bc47
Show file tree
Hide file tree
Showing 18 changed files with 521 additions and 203 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Installation
> [!NOTE]
> The compiled binary packages support compute capability 7.0 and later (Volta and later, such as Tesla V100, RTX 20 series and later). For older GPUs (GTX 10**, Tesla P100), please compile the package with the source code as follows.
Run ```nvidia-smi``` in your terminal to check the installed CUDA version.
Run ```nvidia-smi``` in your terminal to check the installed CUDA version.

Choose the proper package based on your CUDA environment.

Expand All @@ -15,7 +15,7 @@ Choose the proper package based on your CUDA environment.
| **CUDA 11.x** | ```pip3 install gpu4pyscf-cuda11x``` |
| **CUDA 12.x** | ```pip3 install gpu4pyscf-cuda12x``` |

```cuTensor``` is **highly recommended** to be installed for accelerating tensor contractions.
```cuTensor``` is **highly recommended** for accelerating tensor contractions.

For **CUDA 11.x**, ```python -m cupyx.tools.install_library --cuda 11.x --library cutensor```

Expand Down Expand Up @@ -59,7 +59,7 @@ Limitations
- Rys roots up to 9 for direct scf scheme;
- Atomic basis up to g orbitals;
- Auxiliary basis up to h orbitals;
- Up to ~168 atoms with def2-tzvpd basis, consuming a large amount of CPU memory;
- Density fitting scheme up to ~168 atoms with def2-tzvpd basis, bounded CPU memory;
- Hessian is unavailable for Direct SCF yet;
- meta-GGA without density laplacian;

Expand Down
22 changes: 20 additions & 2 deletions examples/00-h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import numpy as np
import cupy
import pyscf
from pyscf import lib
from pyscf.hessian import thermo
from gpu4pyscf.dft import rks
lib.num_threads(8)

Expand All @@ -30,13 +33,14 @@
scf_tol = 1e-10
max_scf_cycles = 50
screen_tol = 1e-14
grids_level = 3
grids_level = 5

mol = pyscf.M(atom=atom, basis=bas, max_memory=32000, output='./pyscf.log')
mol = pyscf.M(atom=atom, basis=bas, max_memory=32000)

mol.verbose = 4
mf_GPU = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis)
mf_GPU.grids.level = grids_level
mf_GPU.grids.atom_grid = (99,590)
mf_GPU.conv_tol = scf_tol
mf_GPU.max_cycle = max_scf_cycles
mf_GPU.screen_tol = screen_tol
Expand All @@ -55,3 +59,17 @@
h = mf_GPU.Hessian()
h.auxbasis_response = 2
h_dft = h.kernel()

# harmonic analysis
results = thermo.harmonic_analysis(mol, h_dft)
thermo.dump_normal_mode(mol, results)

results = thermo.thermo(mf_GPU, results['freq_au'], 298.15, 101325)
thermo.dump_thermo(mol, results)

# force translational symmetry
natm = mol.natm
h_dft = h_dft.transpose([0,2,1,3]).reshape(3*natm,3*natm)
h_diag = h_dft.sum(axis=0)
h_dft -= np.diag(h_diag)
print(h_dft[:3,:3])
6 changes: 3 additions & 3 deletions gpu4pyscf/__config__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,23 @@
# such as A100-80G
if props['totalGlobalMem'] >= 64 * GB:
min_ao_blksize = 128
min_grid_blksize = 128*128
min_grid_blksize = 256*256#128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
number_of_threads = 2048 * 108
# such as V100-32G
elif props['totalGlobalMem'] >= 32 * GB:
min_ao_blksize = 128
min_grid_blksize = 128*128
min_grid_blksize = 256*256#128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
number_of_threads = 1024 * 80
# such as A30-24GB
elif props['totalGlobalMem'] >= 16 * GB:
min_ao_blksize = 128
min_grid_blksize = 128*128
min_grid_blksize = 256*256#128*128
ao_aligned = 32
grid_aligned = 128
mem_fraction = 0.9
Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from . import lib, grad, hessian, solvent, scf, dft
__version__ = '0.6.11'
__version__ = '0.6.12'

# monkey patch libxc reference due to a bug in nvcc
from pyscf.dft import libxc
Expand Down
10 changes: 8 additions & 2 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
class DF(df.DF):
from gpu4pyscf.lib.utils import to_gpu, device

_keys = {'intopt'}

def __init__(self, mol, auxbasis=None):
super().__init__(mol, auxbasis)
self.auxmol = None
Expand Down Expand Up @@ -210,8 +212,12 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
if(not use_gpu_memory):
log.debug("Not enough GPU memory")
# TODO: async allocate memory
mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
try:
mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
except Exception:
raise RuntimeError('Out of CPU memory')

data_stream = cupy.cuda.stream.Stream(non_blocking=False)
count = 0
nq = len(intopt.log_qs)
Expand Down
8 changes: 6 additions & 2 deletions gpu4pyscf/df/grad/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import copy
import numpy
import cupy
from cupyx.scipy.linalg import solve_triangular
Expand Down Expand Up @@ -44,9 +44,13 @@ def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega
if key in mf_grad.base.with_df._rsh_df:
with_df = mf_grad.base.with_df._rsh_df[key]
else:
raise RuntimeError(f'omega={omega} is not calculated in SCF')
dfobj = mf_grad.base.with_df
with_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
#raise RuntimeError(f'omega={omega} is not calculated in SCF')

auxmol = with_df.auxmol
if not hasattr(with_df, 'intopt') or with_df._cderi is None:
with_df.build(omega=omega)
intopt = with_df.intopt

naux = with_df.naux
Expand Down
7 changes: 6 additions & 1 deletion gpu4pyscf/df/hessian/rhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,16 +383,21 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
atmlst, verbose, True):
h1 += vj1 - vk1 * .5
h1ao[ia] = h1
'''
if chkfile is None:
h1ao[ia] = h1
else:
key = 'scf_f1ao/%d' % ia
lib.chkfile.save(chkfile, key, h1)
'''
return h1ao
'''
if chkfile is None:
return h1ao
else:
return chkfile

'''
def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
verbose=None, with_k=True, omega=None):
log = logger.new_logger(hessobj, verbose)
Expand Down
6 changes: 5 additions & 1 deletion gpu4pyscf/df/hessian/rks.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,17 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
for ia, h1, vj1_lr, vk1_lr in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
atmlst, verbose, True, omega=omega):
h1ao[ia] -= .5 * (alpha - hyb) * vk1_lr
return h1ao

# support chkfile?
'''
if chkfile is None:
return h1ao
else:
for ia in atmlst:
lib.chkfile.save(chkfile, 'scf_f1ao/%d'%ia, h1ao[ia])
return chkfile

'''

class Hessian(rks_hess.Hessian):
'''Non-relativistic RKS hessian'''
Expand Down
60 changes: 53 additions & 7 deletions gpu4pyscf/dft/gen_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@
from cupyx.scipy.spatial.distance import cdist
from gpu4pyscf.dft import radi
from gpu4pyscf.lib.cupy_helper import load_library

from gpu4pyscf import __config__ as __gpu4pyscf_config__
libdft = lib.load_library('libdft')
libgdft = load_library('libgdft')

from pyscf.dft.gen_grid import GROUP_BOUNDARY_PENALTY, NELEC_ERROR_TOL, LEBEDEV_ORDER, LEBEDEV_NGRID

GROUP_BOX_SIZE = 3.0
ALIGNMENT_UNIT = 32
ALIGNMENT_UNIT = getattr(__gpu4pyscf_config__, 'grid_aligned', 128)
# SG0
# S. Chien and P. Gill, J. Comput. Chem. 27 (2006) 730-739.

Expand Down Expand Up @@ -334,6 +334,7 @@ def gen_grid_partition(coords):

coords_all = []
weights_all = []
# support atomic_radii_adjust = None
assert radii_adjust == radi.treutler_atomic_radii_adjust
a = -radi.get_treutler_fac(mol, atomic_radii)
for ia in range(mol.natm):
Expand Down Expand Up @@ -377,6 +378,49 @@ def make_mask(mol, coords, relativity=0, shls_slice=None, cutoff=CUTOFF,
'''
return make_screen_index(mol, coords, shls_slice, cutoff)

def atomic_group_grids(mol, coords):
'''
partition the entire space based on atomic position
'''
from scipy.spatial import distance_matrix
natm = mol.natm
ngrids = coords.shape[0]
atom_coords = mol.atom_coords()
dist = distance_matrix(atom_coords, atom_coords)
visited = numpy.zeros(natm, dtype=bool)
current_node = numpy.argmin(atom_coords[:,0])
# greedy traverse atoms
path = [current_node]
while len(path) < natm:
visited[current_node] = True
# Set distances to visited nodes as infinity so they won't be chosen
distances_to_unvisited = numpy.where(visited, numpy.inf, dist[current_node])
next_node = numpy.argmin(distances_to_unvisited)
path.append(next_node)
current_node = next_node

atom_coords = cupy.asarray(atom_coords[path])
#dij = cupy.sum((atom_coords[:,None,:] - coords[None,:,:])**2, axis=2)
#group_ids = cupy.argmin(dij, axis=0)

coords = cupy.asarray(coords, order='F')
atom_coords = cupy.asarray(atom_coords, order='F')
group_ids = cupy.empty([ngrids], dtype=numpy.int32)
stream = cupy.cuda.get_current_stream()
err = libgdft.GDFTgroup_grids(
ctypes.cast(stream.ptr, ctypes.c_void_p),
ctypes.cast(group_ids.data.ptr, ctypes.c_void_p),
ctypes.cast(atom_coords.data.ptr, ctypes.c_void_p),
ctypes.cast(coords.data.ptr, ctypes.c_void_p),
ctypes.c_int(natm),
ctypes.c_int(ngrids)
)
if err != 0:
raise RuntimeError('CUDA Error')

return group_ids.argsort()


def arg_group_grids(mol, coords, box_size=GROUP_BOX_SIZE):
'''
Parition the entire space into small boxes according to the input box_size.
Expand Down Expand Up @@ -510,11 +554,6 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
self.coords, self.weights = self.get_partition(
mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme)

if sort_grids:
idx = arg_group_grids(mol, self.coords)
self.coords = self.coords[idx]
self.weights = self.weights[idx]

if self.alignment > 1:
padding = _padding_size(self.size, self.alignment)
logger.debug(self, 'Padding %d grids', padding)
Expand All @@ -523,6 +562,13 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
self.coords = cupy.vstack(
[self.coords, numpy.repeat([[1e4]*3], padding, axis=0)])
self.weights = cupy.hstack([self.weights, numpy.zeros(padding)])

if sort_grids:
#idx = arg_group_grids(mol, self.coords)
idx = atomic_group_grids(mol, self.coords)
self.coords = self.coords[idx]
self.weights = self.weights[idx]

if with_non0tab:
self.non0tab = self.make_mask(mol, self.coords)
self.screen_index = self.non0tab
Expand Down
2 changes: 2 additions & 0 deletions gpu4pyscf/dft/numint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,8 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
class NumInt(numint.NumInt):
from gpu4pyscf.lib.utils import to_cpu, to_gpu, device

_keys = {'screen_idx', 'xcfuns', 'gdftopt'}

def __init__(self, xc='LDA'):
super().__init__()
self.gdftopt = None
Expand Down
Loading

0 comments on commit 2e5bc47

Please sign in to comment.