Skip to content

Commit

Permalink
Added new code using the SMILES pair encoder tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
akvasan2 committed Jun 1, 2023
1 parent 0f2e903 commit a9bf3f3
Show file tree
Hide file tree
Showing 8 changed files with 7,470 additions and 0 deletions.
3,002 changes: 3,002 additions & 0 deletions Pilot1/ST1/VocabFiles_spe/SPE_ChEMBL.txt

Large diffs are not rendered by default.

3,132 changes: 3,132 additions & 0 deletions Pilot1/ST1/VocabFiles_spe/vocab_spe.txt

Large diffs are not rendered by default.

52 changes: 52 additions & 0 deletions Pilot1/ST1/config_st_spe_training.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"general": {
"use_hvd": true,
"batch_size": 64,
"epochs": 400,
"lr": 0.00000991301767144166,
"loss_fn": "mean_squared_error"
},

"data_loading": {
"data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten",
"rec": "3CLPro_7BQY_A_1_F",
"pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg"
},

"tokenization": {
"vocab_size": 3132,
"maxlen": 45,
"tokenizer": {
"category": "smilespair",
"spe_file": "VocabFiles/SPE_ChEMBL.txt",
"vocab_file": "VocabFiles/vocab_spe.txt"
}
},

"architecture": {
"embedding": {
"embed_dim": 128
},
"transformer_block": {
"num_blocks": 5,
"activation": "selu",
"ff_dim": 128,
"num_heads": 21,
"dr1": 0.12717945391278226,
"dr2": 0.12717945391278226,
"drop_mha": true
},
"regressor_head": {
"activation": "selu",
"dr": 0.04990303516069576
}
},

"callbacks": {
"checkpt_file": "smile_regress.autosave.model.h5",
"log_csv": "smile_regress.training.log",
"patience_red_lr": 20,
"patience_early_stop": 100
}

}
27 changes: 27 additions & 0 deletions Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#PBS -N st_spe
#PBS -l select=4
#PBS -l walltime=12:00:00
#PBS -q preemptable
#PBS -l filesystems=grand
#PBS -A datascience
#PBS -o logs/
#PBS -e logs/
#PBS -m abe
#PBS -M [email protected]

# PBS batch script: launches smiles_regress_transformer_run.py under mpiexec
# with NP total ranks, PPN ranks per node, and stdout redirected to $OUT.
# NOTE(review): NP should presumably equal (select count) * PPN, i.e. 4 * 4
# with the "#PBS -l select=4" directive above -- keep them in sync when editing.

module load conda/2022-09-08
conda activate

# Abort instead of launching the job from the wrong directory.
cd /grand/datascience/avasan/ST_Benchmarks/Test_Tokenizers/SMILESPair_Encoder_continue || {
  printf 'error: cannot cd to run directory\n' >&2
  exit 1
}

NP=16   # total number of ranks passed to mpiexec
PPN=4   # ranks per node passed to mpiexec
OUT=logfile.log

# NDEPTH/NTHREADS are computed but not currently passed to mpiexec or exported.
# NOTE(review): 64 is presumably the CPU count per node, in which case the
# divisor should probably be PPN rather than NP -- confirm before using them.
NDEPTH=$((64 / NP))
NTHREADS=$NDEPTH

# Both variables must be exported; a plain assignment is not placed in the
# environment of the processes started below.
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_FORCE_GPU_ALLOW_GROWTH=true

# NOTE(review): the binding list names 17 CPUs (0-16) in one comma-separated
# group while PPN=4; kept exactly as it was -- confirm this is the intended
# binding.
CPU_BIND=verbose,list:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16

mpiexec --np "$NP" -ppn "$PPN" \
  --cpu-bind "$CPU_BIND" \
  -env NCCL_COLLNET_ENABLE=1 \
  -env NCCL_NET_GDR_LEVEL=PHB \
  python smiles_regress_transformer_run.py > "$OUT"
Loading

0 comments on commit a9bf3f3

Please sign in to comment.