Skip to content

Commit

Permalink
Added New MultiNode version that creates output dirs + checkpoints ea…
Browse files Browse the repository at this point in the history
…ch run
  • Loading branch information
akvasan2 committed Oct 16, 2023
1 parent d89d845 commit a1a1318
Show file tree
Hide file tree
Showing 22 changed files with 438,366 additions and 0 deletions.
3,002 changes: 3,002 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/SPE_ChEMBL.txt

Large diffs are not rendered by default.

3,132 changes: 3,132 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/vocab_spe.txt

Large diffs are not rendered by default.

418,857 changes: 418,857 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/checklist.dat

Large diffs are not rendered by default.

133 changes: 133 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/clr_callback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from tensorflow.keras.callbacks import *
from tensorflow.keras import backend as K
import numpy as np

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).

    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper
    (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.

    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half
        each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations)
        at each cycle iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., scale_fn=clr_fn,
                           scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies choose both the scaling function and the
            # quantity it is evaluated on; the scale_mode argument is not
            # consulted in this branch.
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail fast: without this, scale_fn/scale_mode stay unset and
                # the first clr() call dies with an AttributeError mid-training.
                raise ValueError(
                    "Unknown mode {!r}; expected 'triangular', 'triangular2' "
                    "or 'exp_range' when scale_fn is None.".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        # A full cycle spans 2*step_size iterations; cycles are numbered from 1.
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x is 1 at the cycle boundaries and 0 at mid-cycle, so (1 - x)
        # traces a triangle that peaks halfway through each cycle.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # Any scale_mode other than 'cycle' evaluates scale_fn on the
        # iteration count.
        scale_arg = cycle if self.scale_mode == 'cycle' else self.clr_iterations
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(scale_arg)

    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate when training starts.

        Uses base_lr when no cycle iterations have been counted yet,
        otherwise the value clr() gives for the current iteration count.
        """
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Advance the iteration counters, record history and update the lr.

        The learning rate appended to history is the one read from the
        optimizer before it is overwritten with the new clr() value.
        Every key/value pair in logs is appended to history as well.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())
62 changes: 62 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"general": {
"output": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/output",
"use_hvd": false,
"use_mpi": true,
"batch_size": 512,
"epochs": 400,
"lr": 0.00000991301767144166,
"loss_fn": "mean_squared_error",
"checklist": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/checklist.dat",
"restart": true
},

"inference_data": {
"data_dir": "/lus/gila/projects/Shared/avasan/Inference_Data",
"databases": ["BDB","CAS", "CHM", "DBK", "DCL", "DUD", "E15", "EDB", "EMO", "ENA", "FFI", "G13", "G17", "HOP", "LIT", "MCU", "MOS", "PCH.incomplete", "QM9", "REP", "SAV.incomplete", "SUR", "ZIN"]
},

"data_loading": {
"data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten",
"rec": "3CLPro_7BQY_A_1_F",
"pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg"
},

"tokenization": {
"vocab_size": 3132,
"maxlen": 45,
"tokenizer": {
"category": "smilespair",
"spe_file": "VocabFiles/SPE_ChEMBL.txt",
"vocab_file": "VocabFiles/vocab_spe.txt"
}
},

"architecture": {
"compile": false,
"embedding": {
"embed_dim": 128
},
"transformer_block": {
"num_blocks": 5,
"activation": "selu",
"ff_dim": 128,
"num_heads": 21,
"dr1": 0.12717945391278226,
"dr2": 0.12717945391278226,
"drop_mha": true
},
"regressor_head": {
"activation": "selu",
"dr": 0.04990303516069576
}
},

"callbacks": {
"checkpt_file": "smile_regress.autosave.model.h5",
"log_csv": "smile_regress.training.log",
"patience_red_lr": 20,
"patience_early_stop": 100
}

}
73 changes: 73 additions & 0 deletions Pilot1/ST1/Inf_STRev_MultiNode_CheckList/gpu_affinity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Print the usage text (one line per printf argument; empty arguments are
# blank lines) and terminate the script with exit status 1.
display_help() {
    printf '%s\n' \
        " Will map gpu tile to rank in compact and then round-robin fashion" \
        " Usage:" \
        " mpiexec -np N gpu_tile_compact.sh ./a.out" \
        "" \
        " Example 3 GPU of 2 Tiles with 7 Ranks:" \
        " 0 Rank 0.0" \
        " 1 Rank 0.1" \
        " 2 Rank 1.0" \
        " 3 Rank 1.1" \
        " 4 Rank 2.0" \
        " 5 Rank 2.1" \
        " 6 Rank 0.0" \
        "" \
        " Hacked together by [email protected], please contact if bug found"
    exit 1
}
# Count the GPUs the i915 driver knows about: query udev for every entry under
# the driver's PCI directory, drop lines containing "Unknown", and count the
# remaining "P: /devices" lines so only physically present devices are counted.
num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci:i915/* |& grep -v Unknown | grep -c "P: /devices")
# Number of tiles per GPU used by the rank -> tile mapping below.
num_tile=2

# Optional overrides from the environment. PROCS_PER_TILE alone implies
# 2 * PROCS_PER_TILE ranks per GPU.
if [[ -v PROCS_PER_TILE ]]; then
    _PROCS_PER_TILE=$PROCS_PER_TILE
    _PROCS_PER_GPU=$((PROCS_PER_TILE*2))
fi
if [[ -v PROCS_PER_GPU ]]; then
    _PROCS_PER_GPU=$PROCS_PER_GPU
    # Only compare against PROCS_PER_TILE when it was actually provided.
    # Unset, it evaluates to 0 in arithmetic, which used to reject every
    # positive PROCS_PER_GPU with a message about a variable the user never set.
    # NOTE: a PROCS_PER_GPU other than 1 without PROCS_PER_TILE is not used by
    # the mapping below; the default compact mapping applies in that case.
    if [[ -v _PROCS_PER_TILE && $_PROCS_PER_GPU -gt $((_PROCS_PER_TILE*2)) ]]; then
        echo "PROCS_PER_GPU cannot be greater than 2*PROCS_PER_TILE" >&2
        exit 1
    fi
fi

# Show usage when there is no command to run, help was requested, or no GPU
# was detected (a zero num_gpu would also be a division by zero below).
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ] ; then
    display_help
fi

# Get the RankID from different launcher
if [[ -v MPI_LOCALRANKID ]]; then
    _MPI_RANKID=$MPI_LOCALRANKID
elif [[ -v PALS_LOCAL_RANKID ]]; then
    _MPI_RANKID=$PALS_LOCAL_RANKID
else
    display_help
fi

# Report the detected GPU count once per node (local rank 0 only).
if [[ $_MPI_RANKID -eq 0 ]]; then
    echo "Number of GPUs: $num_gpu"
fi

if [[ $_PROCS_PER_GPU -eq 1 ]]; then
    # One rank per GPU: round-robin over GPUs, always tile 0.
    gpu_id=$((_MPI_RANKID % num_gpu ))
    tile_id=0
elif [[ -v _PROCS_PER_TILE ]]; then
    # Consecutive groups of _PROCS_PER_GPU ranks share a GPU.
    gpu_id=$(( (_MPI_RANKID / _PROCS_PER_GPU) % num_gpu ))
    if [[ $_PROCS_PER_TILE -eq $_PROCS_PER_GPU ]]; then
        # All ranks of a GPU fit on one tile.
        tile_id=0
    else
        tile_id=$(( (_MPI_RANKID / _PROCS_PER_TILE) % num_tile ))
    fi
else
    # Default: one rank per tile, filling both tiles of a GPU before moving
    # to the next GPU, wrapping around once every tile has a rank.
    gpu_id=$(( (_MPI_RANKID / num_tile) % num_gpu ))
    tile_id=$((_MPI_RANKID % num_tile))
fi

unset EnableWalkerPartition
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
# Affinity mask in "<gpu>.<tile>" form for the wrapped command.
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
echo "AFFINITY_MASK [$_MPI_RANKID]: $ZE_AFFINITY_MASK"
#https://stackoverflow.com/a/28099707/7674852
# Run the wrapped command with its arguments exactly as given.
"$@"
Empty file.
Loading

0 comments on commit a1a1318

Please sign in to comment.