Skip to content

Commit

Permalink
refactor solvent models (#139)
Browse files Browse the repository at this point in the history
* refactor solvent models

* flake

* resolve conflict

* fixed bugs due to refactor

* fixed typos

* fixed one more type

* add more unit test for SMD model

* more unit test for SMD gradient and Hessian

* added to_cpu

* to_cpu && to_gpu for solvent

* switch to legacy SMD model

* pricise unit test for solvent models

* remove .mod

* address comments in PySCF

* bug fix

* move python SMD code to experiments

* move SMD functionality to SMD module from pcm module
  • Loading branch information
wxj6000 authored May 13, 2024
1 parent ab7085e commit 9c1feba
Show file tree
Hide file tree
Showing 31 changed files with 4,730 additions and 896 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ ignore =
E701, E731, E741, E275,
F401, C901, W391, W503, W504, W291, W292, W293

exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg, .coveragerc
exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg, .coveragerc, mnsol.F
max-line-length = 160

per-file-ignores =
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ output/
*.swp
*~
*.log
*.mod
*.h5
**/qchem_input.in
.coverage
Expand Down
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ adding features to the library for PySCF functions running on GPU devices.
* While examples or documentation are not mandatory, it is highly recommended to
include examples of how to invoke the new module.

* CUDA compute capability 70 (sm_70) is required. Please avoid using features
that are only available on CUDA compute capability 80 or newer. The CUDA code
* CUDA compute capability 60 (sm_60) is required. Please avoid using features
that are only available on CUDA compute capability 70 or newer. The CUDA code
should be compiled and run using CUDA 11 and CUDA 12 toolkits.

Thank you for your contributions!
24 changes: 23 additions & 1 deletion benchmarks/cupy_helper/benchmark_cart2sph.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,26 @@
t_cutensor = perf_cutensor.gpu_times.mean()
print('kernel:', t_kernel)
print('cutensor:', t_cutensor)
print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')

print('benchmarking cart2sph when ang=5')
a = cupy.random.random([512,21*128,512])
b = cupy.random.random([512,11*128,512])
perf_kernel = profiler.benchmark(cart2sph, (a,1,5,b), n_repeat=20, n_warmup=3)
perf_cutensor = profiler.benchmark(cart2sph_cutensor, (a,1,5), n_repeat=20, n_warmup=3)
t_kernel = perf_kernel.gpu_times.mean()
t_cutensor = perf_cutensor.gpu_times.mean()
print('kernel:', t_kernel)
print('cutensor:', t_cutensor)
print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')

print('benchmarking cart2sph when ang=6')
a = cupy.random.random([512,28*128,512])
b = cupy.random.random([512,13*128,512])
perf_kernel = profiler.benchmark(cart2sph, (a,1,6,b), n_repeat=20, n_warmup=3)
perf_cutensor = profiler.benchmark(cart2sph_cutensor, (a,1,6), n_repeat=20, n_warmup=3)
t_kernel = perf_kernel.gpu_times.mean()
t_cutensor = perf_cutensor.gpu_times.mean()
print('kernel:', t_kernel)
print('cutensor:', t_cutensor)
print('memory bandwidth:',(a.nbytes+b.nbytes)/t_kernel/1024**3, 'GB/s')
3 changes: 1 addition & 2 deletions gpu4pyscf/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

cmake_minimum_required (VERSION 3.19 FATAL_ERROR) # 3.19 is required by cutlass
project (gpu4pyscf C CXX CUDA)
project (gpu4pyscf C CXX CUDA Fortran)

set(CMAKE_C_STANDARD "99")

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

Expand Down
2 changes: 1 addition & 1 deletion gpu4pyscf/lib/cupy_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def hermi_triu(mat, hermi=1, inplace=True):

def cart2sph_cutensor(t, axis=0, ang=1, out=None):
'''
transform 'axis' of a tensor from cartesian basis into spherical basis
transform 'axis' of a tensor from cartesian basis into spherical basis with cutensor
'''
if(ang <= 1):
if(out is not None): out[:] = t
Expand Down
165 changes: 164 additions & 1 deletion gpu4pyscf/lib/cupy_helper/cart2sph.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,166 @@ static void _cart2sph_ang4(double *cart, double *sph, int stride, int count){
sph[sph_offset+8*stride] = 0.625835735449176134 * (g0 + g10) - 3.755014412695056800 * g3;
}

__global__
static void _cart2sph_ang5(double *cart, double *sph, int stride, int count){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= count){
return;
}
int i = idx / stride;
int j = idx % stride;
int sph_offset = 11 * stride * i + j;
int cart_offset = 21 * stride * i + j;
double g0 = cart[cart_offset+0*stride];
double g1 = cart[cart_offset+1*stride];
double g2 = cart[cart_offset+2*stride];
double g3 = cart[cart_offset+3*stride];
double g4 = cart[cart_offset+4*stride];
double g5 = cart[cart_offset+5*stride];
double g6 = cart[cart_offset+6*stride];
double g7 = cart[cart_offset+7*stride];
double g8 = cart[cart_offset+8*stride];
double g9 = cart[cart_offset+9*stride];
double g10 = cart[cart_offset+10*stride];
double g11 = cart[cart_offset+11*stride];
double g12 = cart[cart_offset+12*stride];
double g13 = cart[cart_offset+13*stride];
double g14 = cart[cart_offset+14*stride];
double g15 = cart[cart_offset+15*stride];
double g16 = cart[cart_offset+16*stride];
double g17 = cart[cart_offset+17*stride];
double g18 = cart[cart_offset+18*stride];
double g19 = cart[cart_offset+19*stride];
double g20 = cart[cart_offset+20*stride];
sph[sph_offset+0*stride] = 3.2819102842008507 * (g1 - 2.0 * g6) + 0.6563820568401701 * g15;
sph[sph_offset+1*stride] = 8.3026492595241645 * (g4 - g11);
sph[sph_offset+2*stride] = -1.4677148983057511 * g1 + 11.7417191864460086 * g8 + 0.4892382994352504 * (g15 - 2.0 * g6) + -3.9139063954820030 * g17;
sph[sph_offset+3*stride] = -4.7935367849733241 * (g4 + g11) + 9.5870735699466483 * g13;
sph[sph_offset+4*stride] = 0.4529466511956969 * (g1 + g15 + 2.0 * g6) + -5.4353598143483630 * (g8 + g17) + 3.6235732095655755 * g19;
sph[sph_offset+5*stride] = 1.7542548368013540 * (g2 + g16) + 3.5085096736027079 * g7 + -4.6780128981369442 * (g9 + g18) + 0.9356025796273888 * g20;
sph[sph_offset+6*stride] = 0.4529466511956969 * (g0 + g10 + 2.0 * g3) + -5.4353598143483630 * (g5 + g12) + 3.6235732095655755 * g14;
sph[sph_offset+7*stride] = -2.3967683924866621 * (g2 - g16) + 4.7935367849733241 * (g9 - g18);
sph[sph_offset+8*stride] = -0.4892382994352504 * (g0 - 2.0 * g3) + 3.9139063954820030 * g5 + 1.4677148983057511 * g10 + -11.7417191864460086 * g12;
sph[sph_offset+9*stride] = 2.0756623148810411 * (g2 + g16) + -12.4539738892862477 * g7;
sph[sph_offset+10*stride] = 0.6563820568401701 * g0 + 3.2819102842008507 * (g10 - 2.0*g3);
}

__global__
static void _cart2sph_ang6(double *cart, double *sph, int stride, int count){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= count){
return;
}
int i = idx / stride;
int j = idx % stride;
int sph_offset = 13 * stride * i + j;
int cart_offset = 28 * stride * i + j;
double g0 = cart[cart_offset+0*stride];
double g1 = cart[cart_offset+1*stride];
double g2 = cart[cart_offset+2*stride];
double g3 = cart[cart_offset+3*stride];
double g4 = cart[cart_offset+4*stride];
double g5 = cart[cart_offset+5*stride];
double g6 = cart[cart_offset+6*stride];
double g7 = cart[cart_offset+7*stride];
double g8 = cart[cart_offset+8*stride];
double g9 = cart[cart_offset+9*stride];
double g10 = cart[cart_offset+10*stride];
double g11 = cart[cart_offset+11*stride];
double g12 = cart[cart_offset+12*stride];
double g13 = cart[cart_offset+13*stride];
double g14 = cart[cart_offset+14*stride];
double g15 = cart[cart_offset+15*stride];
double g16 = cart[cart_offset+16*stride];
double g17 = cart[cart_offset+17*stride];
double g18 = cart[cart_offset+18*stride];
double g19 = cart[cart_offset+19*stride];
double g20 = cart[cart_offset+20*stride];
double g21 = cart[cart_offset+21*stride];
double g22 = cart[cart_offset+22*stride];
double g23 = cart[cart_offset+23*stride];
double g24 = cart[cart_offset+24*stride];
double g25 = cart[cart_offset+25*stride];
double g26 = cart[cart_offset+26*stride];
double g27 = cart[cart_offset+27*stride];
sph[sph_offset+0*stride] = 4.0991046311514863 * (g1 + g15) + -13.6636821038382887 * g6;
sph[sph_offset+1*stride] = 11.8330958111587634 * g4 + -23.6661916223175268 * g11 + 2.3666191622317525 * g22;
sph[sph_offset+2*stride] = -2.0182596029148963 * (g1 - g15) + 20.1825960291489679 * (g8 - g17);
sph[sph_offset+3*stride] = -8.2908473356343109 * g4 + -5.5272315570895412 * g11 + 22.1089262283581647 * g13 + 2.7636157785447706 * g22 + -7.3696420761193888 * g24;
sph[sph_offset+4*stride] = 0.9212052595149236 * (g1 + g15 + 2.0 * g6) + -14.7392841522387776 * (g8 + g17 - g19);
sph[sph_offset+5*stride] = 2.9131068125936568 * (g4 + g22) + 5.8262136251873136 * g11 + -11.6524272503746271 * (g13 + g24) + 4.6609709001498505 * g26;
sph[sph_offset+6*stride] = -0.3178460113381421 * (g0 + g21 + 3.0*g3 + 3.0*g10) + 5.7212282040865583 * (g5 + g23) + 11.4424564081731166 * g12 + -7.6283042721154111 * (g14 + g25) + 1.0171072362820548 * g27;
sph[sph_offset+7*stride] = 2.9131068125936568 * (g2 + g16) + 5.8262136251873136 * g7 + -11.6524272503746271 * (g9 + g18) + 4.6609709001498505 * g20;
sph[sph_offset+8*stride] = 0.4606026297574618 * (g0 - g10) + 0.4606026297574618 * (g3 - g21) + -7.3696420761193888 * (g5 - g14 - g23 + g25);
sph[sph_offset+9*stride] = -2.7636157785447706 * (g2 - 2.0 * g7) + 7.3696420761193888 * g9 + 8.2908473356343109 * g16 + -22.1089262283581647 * g18;
sph[sph_offset+10*stride] = -0.5045649007287241 * (g0 + g21) + 2.5228245036436201 * (g3 + g10) + 5.0456490072872420 * (g5 + g23) + -30.2738940437234518 * g12;
sph[sph_offset+11*stride] = 2.3666191622317525 * g2 + 11.8330958111587634 * (g16 - 2.0 * g7);
sph[sph_offset+12*stride] = 0.6831841051919144 * (g0 - g21) + -10.2477615778787161 * (g3 - g10);
}

__global__
static void _cart2sph_ang7(double *cart, double *sph, int stride, int count){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= count){
return;
}
int i = idx / stride;
int j = idx % stride;
int sph_offset = 15 * stride * i + j;
int cart_offset =36 * stride * i + j;
double g0 = cart[cart_offset+0*stride];
double g1 = cart[cart_offset+1*stride];
double g2 = cart[cart_offset+2*stride];
double g3 = cart[cart_offset+3*stride];
double g4 = cart[cart_offset+4*stride];
double g5 = cart[cart_offset+5*stride];
double g6 = cart[cart_offset+6*stride];
double g7 = cart[cart_offset+7*stride];
double g8 = cart[cart_offset+8*stride];
double g9 = cart[cart_offset+9*stride];
double g10 = cart[cart_offset+10*stride];
double g11 = cart[cart_offset+11*stride];
double g12 = cart[cart_offset+12*stride];
double g13 = cart[cart_offset+13*stride];
double g14 = cart[cart_offset+14*stride];
double g15 = cart[cart_offset+15*stride];
double g16 = cart[cart_offset+16*stride];
double g17 = cart[cart_offset+17*stride];
double g18 = cart[cart_offset+18*stride];
double g19 = cart[cart_offset+19*stride];
double g20 = cart[cart_offset+20*stride];
double g21 = cart[cart_offset+21*stride];
double g22 = cart[cart_offset+22*stride];
double g23 = cart[cart_offset+23*stride];
double g24 = cart[cart_offset+24*stride];
double g25 = cart[cart_offset+25*stride];
double g26 = cart[cart_offset+26*stride];
double g27 = cart[cart_offset+27*stride];
double g28 = cart[cart_offset+28*stride];
double g29 = cart[cart_offset+29*stride];
double g30 = cart[cart_offset+30*stride];
double g31 = cart[cart_offset+31*stride];
double g32 = cart[cart_offset+32*stride];
double g33 = cart[cart_offset+33*stride];
double g34 = cart[cart_offset+34*stride];
double g35 = cart[cart_offset+35*stride];
sph[sph_offset+0*stride] = 4.9501391276721742 * g1 + -24.7506956383608703 * g6 + 14.8504173830165218 * g15 + -0.7071627325245963 * g28;
sph[sph_offset+1*stride] = 15.8757639708114002 * (g4 + g22) + -52.9192132360380043 * g11;
sph[sph_offset+2*stride] = -2.5945778936013020 * (g1 - g6) + 31.1349347232156219 * g8 + 4.6702402084823440 * g15 + -62.2698694464312439 * g17 + -0.5189155787202604 * g28 + 6.2269869446431247 * g30;
sph[sph_offset+3*stride] = -12.4539738892862495 * (g4 - g22) + 41.5132462976208316 * (g13 - g24);
sph[sph_offset+4*stride] = 1.4081304047606462 * g1 + 2.3468840079344107 * g6 + -28.1626080952129243 * g8 + 0.4693768015868821 * (g15 - g28) + -18.7750720634752817 * g17 + 37.5501441269505705 * g19 + 9.3875360317376408 * g30 + -12.5167147089835229 * g32;
sph[sph_offset+5*stride] = 6.6379903866747414 * (g4 + g22) + 13.2759807733494828 * g11 + -35.4026153955986160 * (g13 + g24) + 21.2415692373591725 * g26;
sph[sph_offset+6*stride] = -0.4516580379125866 * (g1 + g28) + -1.3549741137377600 * (g6 + g15) + 10.8397929099020782 * (g8 + g30) + 21.6795858198041564 * (g17 - g19 - g32) + 5.7812228852811094 * g34;
sph[sph_offset+7*stride] = -2.3899496919201728 * (g2 + g29) + -7.1698490757605189 * (g7 + g16) + 14.3396981515210360 * (g9 + g31) + 28.6793963030420720 * g18 + -11.4717585212168292 * (g20 + g33) + 1.0925484305920790 * g35;
sph[sph_offset+8*stride] = -0.4516580379125866 * (g0 + g21) + -1.3549741137377600 * (g3 + g10) + 10.8397929099020782 * (g5 + g23) + 21.6795858198041564 * g12 + -21.6795858198041564 * (g14 + g25) + 5.7812228852811094 * g27;
sph[sph_offset+9*stride] = 3.3189951933373707 * (g2 + g7 - g16 - g29) + -17.7013076977993080 * (g9 - g31) + 10.6207846186795862 * (g20 - g33);
sph[sph_offset+10*stride] = 0.4693768015868821 * (g0 - g3) + -9.3875360317376408 * g5 + -2.3468840079344107 * g10 + 18.7750720634752817 * g12 + 12.5167147089835229 * g14 + -1.4081304047606462 * g21 + 28.1626080952129243 * g23 + -37.5501441269505705 * g25;
sph[sph_offset+11*stride] = -3.1134934723215624 * (g2 + g29) + 15.5674673616078110 * (g7 + g16) + 10.3783115744052079 * (g9 + g31) + -62.2698694464312439 * g18;
sph[sph_offset+12*stride] = -0.5189155787202604 * g0 + 4.6702402084823440 * g3 + 6.2269869446431247 * g5 + 2.5945778936013020 * (g10 - g21) + -62.2698694464312439 * g12 + 31.1349347232156219 * g23;
sph[sph_offset+13*stride] = 2.6459606618019000 * (g2 - g29) + -39.6894099270284997 * (g7 - g16);
sph[sph_offset+14*stride] = 0.7071627325245963 * g0 + -14.8504173830165218 * g3 + 24.7506956383608703 * g10 + -4.9501391276721742 * g21;
}

extern "C" {
__host__
int cart2sph(cudaStream_t stream, double *cart_gto, double *sph_gto, int stride, int count, int ang)
Expand All @@ -123,8 +283,11 @@ int cart2sph(cudaStream_t stream, double *cart_gto, double *sph_gto, int stride,
case 2: _cart2sph_ang2 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
case 3: _cart2sph_ang3 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
case 4: _cart2sph_ang4 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
case 5: _cart2sph_ang5 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
case 6: _cart2sph_ang6 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
case 7: _cart2sph_ang7 <<<blocks, threads, 0, stream>>> (cart_gto, sph_gto, stride, count); break;
default:
fprintf(stderr, "Ang > 4 is not supported!");
fprintf(stderr, "Ang > 7 is not supported!\n");
return 1;
}

Expand Down
17 changes: 9 additions & 8 deletions gpu4pyscf/lib/gvhf/nr_jk_driver_int3c2e_pass1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ static int GINTrun_tasks_int3c2e_pass1_j(JKMatrix *jk, BasisProdOffsets *offsets
case 0b0000: GINTint3c2e_pass1_j_kernel0000<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
case 0b0010: GINTint3c2e_pass1_j_kernel0010<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
case 0b1000: GINTint3c2e_pass1_j_kernel1000<<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
default: fprintf(stderr, "roots=1 type_ijkl %d\n", type_ijkl);
default: fprintf(stderr, "rys roots 1 type_ijkl %d\n", type_ijkl);
return 1;
}
break;
case 2: GINTint3c2e_pass1_j_kernel<2, GSIZE2_INT3C> <<<blocks, threads, 0, stream>>>(*envs, *jk, *offsets); break;
Expand All @@ -78,7 +79,7 @@ static int GINTrun_tasks_int3c2e_pass1_j(JKMatrix *jk, BasisProdOffsets *offsets

extern "C" { __host__
int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
double *dm, double *rhoj,
double *dm, double *rhoj,
int nao, int naux, int n_dm,
int *bins_locs_ij, int *bins_locs_kl,
int ncp_ij, int ncp_kl)
Expand All @@ -98,22 +99,22 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
jk.ao_offsets_j = 0;
jk.ao_offsets_k = nao + 1;
jk.ao_offsets_l = nao;

int *bas_pairs_locs = bpcache->bas_pairs_locs;
int *primitive_pairs_locs = bpcache->primitive_pairs_locs;
cudaStream_t streams[MAX_STREAMS];
for (n = 0; n < MAX_STREAMS; n++){
checkCudaErrors(cudaStreamCreate(&streams[n]));
}

int *idx = (int *)malloc(sizeof(int) * TOT_NF * 3);
int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2));
int *l_locs = (int *)malloc(sizeof(int) * (GPU_LMAX + 2));
GINTinit_index1d_xyz(idx, l_locs);
checkCudaErrors(cudaMemcpyToSymbol(c_idx, idx, sizeof(int) * TOT_NF*3));
checkCudaErrors(cudaMemcpyToSymbol(c_l_locs, l_locs, sizeof(int) * (GPU_LMAX + 2)));
free(idx);
free(l_locs);

for (int cp_ij_id = 0; cp_ij_id < ncp_ij; cp_ij_id++){
for (int k = 0; k < ncp_kl; k++, n++){
int n_stream = n % MAX_STREAMS;
Expand All @@ -132,7 +133,7 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
int ntasks_kl = bins_locs_kl[k+1] - bins_locs_kl[k];
if (ntasks_kl <= 0) continue;
if (ntasks_ij <= 0) continue;

BasisProdOffsets offsets;
offsets.ntasks_ij = ntasks_ij;
offsets.ntasks_kl = ntasks_kl;
Expand All @@ -151,7 +152,7 @@ int GINTbuild_j_int3c2e_pass1(BasisProdCache *bpcache,
checkCudaErrors(cudaStreamSynchronize(streams[n]));
checkCudaErrors(cudaStreamDestroy(streams[n]));
}

return 0;
}

Expand Down
Loading

0 comments on commit 9c1feba

Please sign in to comment.