Added New MultiNode version that creates output dirs + checkpoints each run
ECP-CANDLE · Oct 16, 2023 · a1a1318 · a1a1318
1 parent d89d845
commit a1a1318
Show file tree

Hide file tree

Showing 22 changed files with 438,366 additions and 0 deletions.
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/SPE_ChEMBL.txt b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/SPE_ChEMBL.txt
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/vocab_spe.txt b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/vocab_spe.txt
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/checklist.dat b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/checklist.dat
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/clr_callback.py b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/clr_callback.py
@@ -0,0 +1,133 @@
+from tensorflow.keras.callbacks import *
+from tensorflow.keras import backend as K
+import numpy as np
+
+class CyclicLR(Callback):
+    """This callback implements a cyclical learning rate policy (CLR).
+    The method cycles the learning rate between two boundaries with
+    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
+    The amplitude of the cycle can be scaled on a per-iteration or 
+    per-cycle basis.
+    This class has three built-in policies, as put forth in the paper.
+    "triangular":
+        A basic triangular cycle w/ no amplitude scaling.
+    "triangular2":
+        A basic triangular cycle that scales initial amplitude by half each cycle.
+    "exp_range":
+        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
+        cycle iteration.
+    For more detail, please see paper.
+    
+    # Example
+        ```python
+            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                                step_size=2000., mode='triangular')
+            model.fit(X_train, Y_train, callbacks=[clr])
+        ```
+    
+    Class also supports custom scaling functions:
+        ```python
+            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
+            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                                step_size=2000., scale_fn=clr_fn,
+                                scale_mode='cycle')
+            model.fit(X_train, Y_train, callbacks=[clr])
+        ```    
+    # Arguments
+        base_lr: initial learning rate which is the
+            lower boundary in the cycle.
+        max_lr: upper boundary in the cycle. Functionally,
+            it defines the cycle amplitude (max_lr - base_lr).
+            The lr at any cycle is the sum of base_lr
+            and some scaling of the amplitude; therefore 
+            max_lr may not actually be reached depending on
+            scaling function.
+        step_size: number of training iterations per
+            half cycle. Authors suggest setting step_size
+            2-8 x training iterations in epoch.
+        mode: one of {triangular, triangular2, exp_range}.
+            Default 'triangular'.
+            Values correspond to policies detailed above.
+            If scale_fn is not None, this argument is ignored.
+        gamma: constant in 'exp_range' scaling function:
+            gamma**(cycle iterations)
+        scale_fn: Custom scaling policy defined by a single
+            argument lambda function, where 
+            0 <= scale_fn(x) <= 1 for all x >= 0.
+            mode paramater is ignored 
+        scale_mode: {'cycle', 'iterations'}.
+            Defines whether scale_fn is evaluated on 
+            cycle number or cycle iterations (training
+            iterations since start of cycle). Default is 'cycle'.
+    """
+
+    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
+                 gamma=1., scale_fn=None, scale_mode='cycle'):
+        super(CyclicLR, self).__init__()
+
+        self.base_lr = base_lr
+        self.max_lr = max_lr
+        self.step_size = step_size
+        self.mode = mode
+        self.gamma = gamma
+        if scale_fn == None:
+            if self.mode == 'triangular':
+                self.scale_fn = lambda x: 1.
+                self.scale_mode = 'cycle'
+            elif self.mode == 'triangular2':
+                self.scale_fn = lambda x: 1/(2.**(x-1))
+                self.scale_mode = 'cycle'
+            elif self.mode == 'exp_range':
+                self.scale_fn = lambda x: gamma**(x)
+                self.scale_mode = 'iterations'
+        else:
+            self.scale_fn = scale_fn
+            self.scale_mode = scale_mode
+        self.clr_iterations = 0.
+        self.trn_iterations = 0.
+        self.history = {}
+
+        self._reset()
+
+    def _reset(self, new_base_lr=None, new_max_lr=None,
+               new_step_size=None):
+        """Resets cycle iterations.
+        Optional boundary/step size adjustment.
+        """
+        if new_base_lr != None:
+            self.base_lr = new_base_lr
+        if new_max_lr != None:
+            self.max_lr = new_max_lr
+        if new_step_size != None:
+            self.step_size = new_step_size
+        self.clr_iterations = 0.
+
+    def clr(self):
+        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
+        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
+        if self.scale_mode == 'cycle':
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
+        else:
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
+
+    def on_train_begin(self, logs={}):
+        logs = logs or {}
+
+        if self.clr_iterations == 0:
+            K.set_value(self.model.optimizer.lr, self.base_lr)
+        else:
+            K.set_value(self.model.optimizer.lr, self.clr())        
+
+    def on_batch_end(self, epoch, logs=None):
+
+        logs = logs or {}
+        self.trn_iterations += 1
+        self.clr_iterations += 1
+
+        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
+        self.history.setdefault('iterations', []).append(self.trn_iterations)
+
+        for k, v in logs.items():
+            self.history.setdefault(k, []).append(v)
+
+        K.set_value(self.model.optimizer.lr, self.clr())
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/config.json b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/config.json
@@ -0,0 +1,62 @@
+{
+    "general": {
+        "output": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/output",
+        "use_hvd": false,
+        "use_mpi": true,
+        "batch_size": 512,
+        "epochs": 400,
+        "lr": 0.00000991301767144166,
+        "loss_fn": "mean_squared_error",
+        "checklist": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/checklist.dat",
+        "restart": true
+    },
+
+    "inference_data": {
+        "data_dir": "/lus/gila/projects/Shared/avasan/Inference_Data",
+        "databases": ["BDB","CAS", "CHM", "DBK",  "DCL",  "DUD",  "E15",  "EDB",  "EMO",  "ENA",  "FFI",  "G13",  "G17",  "HOP",  "LIT",  "MCU",  "MOS",  "PCH.incomplete",  "QM9",  "REP",  "SAV.incomplete",  "SUR",  "ZIN"]
+    },
+
+    "data_loading": {
+        "data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten",
+        "rec": "3CLPro_7BQY_A_1_F",
+        "pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg"
+    },
+
+    "tokenization": {
+        "vocab_size": 3132,
+        "maxlen": 45,
+        "tokenizer": {
+            "category": "smilespair",
+            "spe_file": "VocabFiles/SPE_ChEMBL.txt",
+            "vocab_file": "VocabFiles/vocab_spe.txt"
+        }
+    },
+
+    "architecture": {
+        "compile": false,
+        "embedding": {
+            "embed_dim": 128 
+        },
+        "transformer_block": {
+            "num_blocks": 5,
+            "activation": "selu",
+            "ff_dim": 128,
+            "num_heads": 21,
+            "dr1": 0.12717945391278226,
+            "dr2": 0.12717945391278226,
+            "drop_mha": true
+        },
+        "regressor_head": {
+            "activation": "selu",
+            "dr": 0.04990303516069576
+        }
+    },
+
+    "callbacks": {
+        "checkpt_file": "smile_regress.autosave.model.h5",
+        "log_csv": "smile_regress.training.log",
+        "patience_red_lr": 20,
+        "patience_early_stop": 100
+    }
+
+}
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/gpu_affinity.sh b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/gpu_affinity.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+display_help() {  # print the usage banner (one printf argument per output line), then exit 1
+  printf '%s\n' " Will map gpu tile to rank in compact and then round-robin fashion" \
+    " Usage:" \
+    "   mpiexec -np N gpu_tile_compact.sh ./a.out" \
+    "" \
+    " Example 3 GPU of 2 Tiles with 7 Ranks:" \
+    "   0 Rank 0.0" \
+    "   1 Rank 0.1" \
+    "   2 Rank 1.0" \
+    "   3 Rank 1.1" \
+    "   4 Rank 2.0" \
+    "   5 Rank 2.1" \
+    "   6 Rank 0.0" \
+    "" \
+    " Hacked together by [email protected], please contact if bug found"
+  exit 1
+}
+
+# GPU count as known to the i915 driver; udev is used so that only devices with physical presence are enumerated.
+num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci:i915/* |& grep -v Unknown | grep -c "P: /devices")  # counts "P: /devices" lines, skipping lines that contain "Unknown"
+num_tile=2  # tiles per GPU (hard-coded); used as the modulus for tile_id below
+
+if [[ -v PROCS_PER_TILE ]]; then  # optional env override: ranks per tile
+    _PROCS_PER_TILE=$PROCS_PER_TILE
+    _PROCS_PER_GPU=$((PROCS_PER_TILE*2))  # default ranks per GPU: twice the per-tile count
+fi
+
+if [[ -v PROCS_PER_GPU ]]; then  # optional env override: ranks per GPU (replaces the default above)
+    _PROCS_PER_GPU=$PROCS_PER_GPU
+    if [[ $_PROCS_PER_GPU -gt $((_PROCS_PER_TILE*2)) ]]; then  # NOTE(review): with PROCS_PER_TILE unset the right side is 0, so any positive PROCS_PER_GPU exits here - confirm intended
+       echo "PROCS_PER_GPU cannot be greater than 2*PROCS_PER_TILE"
+       exit 1
+    fi
+fi
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ] ; then  # no command given, help requested, or no GPU counted
+  display_help
+fi
+
+# Take the rank ID from whichever launcher variable is set: MPI_LOCALRANKID first, else PALS_LOCAL_RANKID.
+if [[ -v MPI_LOCALRANKID ]]; then
+  _MPI_RANKID=$MPI_LOCALRANKID 
+elif [[ -v PALS_LOCAL_RANKID ]]; then
+  _MPI_RANKID=$PALS_LOCAL_RANKID
+else
+  display_help  # neither variable is set: print usage and exit 1
+fi
+
+if [[ $_MPI_RANKID -eq 0 ]]; then  # only rank ID 0 reports the GPU count
+    echo "Number of GPUs: $num_gpu"
+fi
+if [[ $_PROCS_PER_GPU -eq 1 ]]; then  # one rank per GPU: round-robin over GPUs, always tile 0
+    gpu_id=$((_MPI_RANKID % num_gpu ))
+    tile_id=0
+elif [[ -v _PROCS_PER_TILE ]]; then  # PROCS_PER_TILE was given: _PROCS_PER_GPU consecutive ranks share a GPU
+    gpu_id=$(((_MPI_RANKID / $_PROCS_PER_GPU) % num_gpu))
+    if [[ $_PROCS_PER_TILE -eq $_PROCS_PER_GPU ]]; then  # all of a GPU's ranks go to tile 0
+        tile_id=0
+    else
+        tile_id=$(((_MPI_RANKID / $_PROCS_PER_TILE) % num_tile))  # groups of _PROCS_PER_TILE consecutive ranks alternate tiles
+    fi
+else  # no PROCS_PER_TILE: consecutive ranks fill a GPU's tiles, then wrap round-robin over GPUs
+    gpu_id=$(((_MPI_RANKID / num_tile) % num_gpu))
+    tile_id=$((_MPI_RANKID % num_tile))
+fi
+unset EnableWalkerPartition  # ensure this variable is not passed on to the launched command
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1  # presumably makes Level Zero order devices by PCI ID - TODO confirm against Level Zero docs
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id  # "<gpu>.<tile>" selection exported for the launched command
+echo "AFFINITY_MASK [$_MPI_RANKID]: $ZE_AFFINITY_MASK"
+# Run the wrapped command with its arguments preserved; quoting per https://stackoverflow.com/a/28099707/7674852
+"$@"
diff --git a/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/logs/output.log b/Pilot1/ST1/Inf_STRev_MultiNode_CheckList/logs/output.log