-
Notifications
You must be signed in to change notification settings - Fork 83
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added New MultiNode version that creates output dirs + checkpoints each run
- Loading branch information
Showing
22 changed files
with
438,366 additions
and
0 deletions.
There are no files selected for viewing
3,002 changes: 3,002 additions & 0 deletions
3,002
Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/SPE_ChEMBL.txt
Large diffs are not rendered by default.
Oops, something went wrong.
3,132 changes: 3,132 additions & 0 deletions
3,132
Pilot1/ST1/Inf_STRev_MultiNode_CheckList/VocabFiles/vocab_spe.txt
Large diffs are not rendered by default.
Oops, something went wrong.
418,857 changes: 418,857 additions & 0 deletions
418,857
Pilot1/ST1/Inf_STRev_MultiNode_CheckList/checklist.dat
Large diffs are not rendered by default.
Oops, something went wrong.
133 changes: 133 additions & 0 deletions
133
Pilot1/ST1/Inf_STRev_MultiNode_CheckList/clr_callback.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from tensorflow.keras.callbacks import * | ||
from tensorflow.keras import backend as K | ||
import numpy as np | ||
|
||
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).

    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper
    (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.

    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half
        each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations)
        at each cycle iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., scale_fn=clr_fn,
                           scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies pick both the scaling function and the
            # quantity (cycle number vs. iteration count) it is applied to.
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** (x)
                self.scale_mode = 'iterations'
            else:
                # Without this, scale_fn/scale_mode stay unset and the
                # first call to clr() fails with an AttributeError.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or "
                    "'exp_range' when scale_fn is None; got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        # Iterations since the last _reset() (drives the cycle position).
        self.clr_iterations = 0.
        # Iterations since this callback was created (never reset).
        self.trn_iterations = 0.
        # Per-batch record of lr, iteration count and whatever is in `logs`.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current cycle iteration."""
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        # Any scale_mode other than 'cycle' is evaluated on the iteration count.
        if self.scale_mode == 'cycle':
            scale_arg = cycle
        else:
            scale_arg = self.clr_iterations
        return (self.base_lr
                + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(scale_arg))

    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate before training starts."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the batch in `history` and set the next learning rate.

        NOTE(review): the first positional argument is named `epoch` here,
        but Keras documents this hook as receiving the batch index; the
        name is kept for backward compatibility and the value is unused.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
{ | ||
"general": { | ||
"output": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/output", | ||
"use_hvd": false, | ||
"use_mpi": true, | ||
"batch_size": 512, | ||
"epochs": 400, | ||
"lr": 0.00000991301767144166, | ||
"loss_fn": "mean_squared_error", | ||
"checklist": "/lus/gila/projects/candle_aesp_CNDA/avasan/DockingSurrogates/Inference/Inference_Scaling/ST_Sort_CheckList/checklist.dat", | ||
"restart": true | ||
}, | ||
|
||
"inference_data": { | ||
"data_dir": "/lus/gila/projects/Shared/avasan/Inference_Data", | ||
"databases": ["BDB","CAS", "CHM", "DBK", "DCL", "DUD", "E15", "EDB", "EMO", "ENA", "FFI", "G13", "G17", "HOP", "LIT", "MCU", "MOS", "PCH.incomplete", "QM9", "REP", "SAV.incomplete", "SUR", "ZIN"] | ||
}, | ||
|
||
"data_loading": { | ||
"data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten", | ||
"rec": "3CLPro_7BQY_A_1_F", | ||
"pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg" | ||
}, | ||
|
||
"tokenization": { | ||
"vocab_size": 3132, | ||
"maxlen": 45, | ||
"tokenizer": { | ||
"category": "smilespair", | ||
"spe_file": "VocabFiles/SPE_ChEMBL.txt", | ||
"vocab_file": "VocabFiles/vocab_spe.txt" | ||
} | ||
}, | ||
|
||
"architecture": { | ||
"compile": false, | ||
"embedding": { | ||
"embed_dim": 128 | ||
}, | ||
"transformer_block": { | ||
"num_blocks": 5, | ||
"activation": "selu", | ||
"ff_dim": 128, | ||
"num_heads": 21, | ||
"dr1": 0.12717945391278226, | ||
"dr2": 0.12717945391278226, | ||
"drop_mha": true | ||
}, | ||
"regressor_head": { | ||
"activation": "selu", | ||
"dr": 0.04990303516069576 | ||
} | ||
}, | ||
|
||
"callbacks": { | ||
"checkpt_file": "smile_regress.autosave.model.h5", | ||
"log_csv": "smile_regress.training.log", | ||
"patience_red_lr": 20, | ||
"patience_early_stop": 100 | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/usr/bin/env bash | ||
| ||
# Print the usage text (including a sample rank -> GPU.tile mapping) to
# stdout, then terminate the script with exit status 1.
display_help() {
    cat <<'EOF'
 Will map gpu tile to rank in compact and then round-robin fashion
 Usage:
 mpiexec -np N gpu_tile_compact.sh ./a.out

 Example 3 GPU of 2 Tiles with 7 Ranks:
 0 Rank 0.0
 1 Rank 0.1
 2 Rank 1.0
 3 Rank 1.1
 4 Rank 2.0
 5 Rank 2.1
 6 Rank 0.0

 Hacked together by [email protected], please contact if bug found
EOF
    exit 1
}
| ||
# This gives the exact GPU count i915 knows about; udev is used to only
# enumerate the devices with physical presence.
num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_tile=2

# Optional overrides taken from the environment.
if [[ -v PROCS_PER_TILE ]]; then
    _PROCS_PER_TILE=$PROCS_PER_TILE
    _PROCS_PER_GPU=$((PROCS_PER_TILE*2))
fi

if [[ -v PROCS_PER_GPU ]]; then
    _PROCS_PER_GPU=$PROCS_PER_GPU
    # PROCS_PER_GPU=1 without PROCS_PER_TILE is handled by its own branch
    # further down, so it must not be rejected here: with PROCS_PER_TILE
    # unset the right-hand side evaluates to 0 and the check would always
    # fire. Every other combination is validated exactly as before.
    if [[ -v _PROCS_PER_TILE || $_PROCS_PER_GPU -ne 1 ]] &&
       [[ $_PROCS_PER_GPU -gt $((_PROCS_PER_TILE*2)) ]]; then
        echo "PROCS_PER_GPU cannot be greater than 2*PROCS_PER_TILE"
        exit 1
    fi
fi

# Show usage when no command was given, help was requested, or no GPU was found.
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ] ; then
    display_help
fi

# Get the RankID from different launcher
if [[ -v MPI_LOCALRANKID ]]; then
    _MPI_RANKID=$MPI_LOCALRANKID
elif [[ -v PALS_LOCAL_RANKID ]]; then
    _MPI_RANKID=$PALS_LOCAL_RANKID
else
    display_help
fi

# Only local rank 0 reports the GPU count, so it is printed once per launcher-local rank set.
if [[ $_MPI_RANKID -eq 0 ]]; then
    echo "Number of GPUs: $num_gpu"
fi

if [[ $_PROCS_PER_GPU -eq 1 ]]; then
    # One rank per GPU: round-robin over GPUs, always on tile 0.
    gpu_id=$((_MPI_RANKID % num_gpu ))
    tile_id=0
elif [[ -v _PROCS_PER_TILE ]]; then
    # Pack _PROCS_PER_GPU consecutive ranks onto each GPU before moving on.
    gpu_id=$(((_MPI_RANKID / $_PROCS_PER_GPU) % num_gpu))
    if [[ $_PROCS_PER_TILE -eq $_PROCS_PER_GPU ]]; then
        tile_id=0
    else
        tile_id=$(((_MPI_RANKID / $_PROCS_PER_TILE) % num_tile))
    fi
else
    # Default: one rank per tile, filling both tiles of a GPU before the next GPU.
    gpu_id=$(((_MPI_RANKID / num_tile) % num_gpu))
    tile_id=$((_MPI_RANKID % num_tile))
fi

unset EnableWalkerPartition
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
echo "AFFINITY_MASK [$_MPI_RANKID]: $ZE_AFFINITY_MASK"
# Run the wrapped command with the environment exported above.
#https://stackoverflow.com/a/28099707/7674852
"$@"
Empty file.
Oops, something went wrong.