Added new code using the SMILES pair encoder tokenizer

ECP-CANDLE · Jun 1, 2023 · a9bf3f3 · a9bf3f3
1 parent 0f2e903
commit a9bf3f3
Show file tree

Hide file tree

Showing 8 changed files with 7,470 additions and 0 deletions.
diff --git a/Pilot1/ST1/VocabFiles_spe/SPE_ChEMBL.txt b/Pilot1/ST1/VocabFiles_spe/SPE_ChEMBL.txt
diff --git a/Pilot1/ST1/VocabFiles_spe/vocab_spe.txt b/Pilot1/ST1/VocabFiles_spe/vocab_spe.txt
diff --git a/Pilot1/ST1/config_st_spe_training.json b/Pilot1/ST1/config_st_spe_training.json
@@ -0,0 +1,52 @@
+{
+    "general": {
+        "use_hvd": true,
+        "batch_size": 64,
+        "epochs": 400,
+        "lr": 0.00000991301767144166,
+        "loss_fn": "mean_squared_error"
+    },
+
+    "data_loading": {
+        "data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten",
+        "rec": "3CLPro_7BQY_A_1_F",
+        "pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg"
+    },
+
+    "tokenization": {
+        "vocab_size": 3132,
+        "maxlen": 45,
+        "tokenizer": {
+            "category": "smilespair",
+            "spe_file": "VocabFiles/SPE_ChEMBL.txt",
+            "vocab_file": "VocabFiles/vocab_spe.txt"
+        }
+    },
+
+    "architecture": {
+        "embedding": {
+            "embed_dim": 128 
+        },
+        "transformer_block": {
+            "num_blocks": 5,
+            "activation": "selu",
+            "ff_dim": 128,
+            "num_heads": 21,
+            "dr1": 0.12717945391278226,
+            "dr2": 0.12717945391278226,
+            "drop_mha": true
+        },
+        "regressor_head": {
+            "activation": "selu",
+            "dr": 0.04990303516069576
+        }
+    },
+
+    "callbacks": {
+        "checkpt_file": "smile_regress.autosave.model.h5",
+        "log_csv": "smile_regress.training.log",
+        "patience_red_lr": 20,
+        "patience_early_stop": 100
+    }
+
+}
diff --git a/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh b/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# PBS batch job for Polaris: runs smiles_regress_transformer_run.py under
+# mpiexec with NP total ranks and PPN ranks per node. Stdout of the run is
+# written to OUT in the working directory.
+#
+# REVIEW NOTE: the -M address below appears as a redaction placeholder in this
+# view. Confirm the real address before submitting.
+#PBS -N st_spe
+#PBS -l select=4
+#PBS -l walltime=12:00:00
+#PBS -q preemptable
+#PBS -l filesystems=grand
+#PBS -A datascience
+#PBS -o logs/
+#PBS -e logs/
+#PBS -m abe
+#PBS -M [email protected]
+
+module load conda/2022-09-08
+conda activate
+
+# Abort if the working directory is missing instead of launching elsewhere.
+cd /grand/datascience/avasan/ST_Benchmarks/Test_Tokenizers/SMILESPair_Encoder_continue || exit 1
+
+NP=16   # total MPI ranks: 4 nodes from select=4 times PPN
+PPN=4   # MPI ranks per node
+OUT=logfile.log
+# NDEPTH and NTHREADS are computed but not passed to mpiexec below.
+NDEPTH=$(( 64 / NP ))
+NTHREADS=$NDEPTH
+
+# Exported so the python processes started by mpiexec inherit both settings.
+export TF_GPU_ALLOCATOR=cuda_malloc_async
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+
+# REVIEW NOTE: the cpu-bind list names 17 CPU ids, 0 through 16, for PPN=4
+# ranks per node. Confirm this is the intended binding.
+mpiexec --np "$NP" -ppn "$PPN" \
+  --cpu-bind verbose,list:0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 \
+  -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB \
+  python smiles_regress_transformer_run.py > "$OUT"