From 4e127323eedbb88f2989e43917f8416dda41e176 Mon Sep 17 00:00:00 2001
From: Tanima Dey
Date: Tue, 15 Aug 2023 12:52:15 -0700
Subject: [PATCH] Added callbacks to measure performance throughput more
 precisely. By default, they report the throughput for each epoch. Updated
 the optimizer API for the keras==2.12 and keras==2.13 versions.

---
 Pilot1/Attn/attn_abstention_keras2.py         |  3 +
 Pilot1/Attn/attn_baseline_keras2.py           |  3 +
 Pilot1/Attn/attn_bin_working_jan7_h5.py       |  6 +-
 Pilot1/Combo/combo_baseline_keras2.py         |  8 +++
 Pilot1/NT3/nt3_baseline_keras2.py             |  6 +-
 Pilot1/P1B1/p1b1_baseline_keras2.py           |  4 +-
 Pilot1/P1B2/p1b2_baseline_keras2.py           |  6 +-
 Pilot1/P1B3/p1b3_baseline_keras2.py           |  4 +-
 Pilot1/TC1/tc1_baseline_keras2.py             |  4 +-
 Pilot2/P2B1/p2b1.py                           |  2 +-
 Pilot2/P2B1/p2b1_baseline_keras2.py           |  4 +-
 Pilot3/P3B1/p3b1_baseline_keras2.py           |  5 +-
 Pilot3/P3B2/p3b2_baseline_keras2.py           | 38 ++++++++---
 Pilot3/P3B2/p3b2_default_model.txt            |  2 +
 Pilot3/P3B3/p3b3_baseline_keras2.py           |  6 +-
 Pilot3/P3B4/dense_attention.py                |  7 +-
 Pilot3/P3B4/tf2_mthisan.py                    | 12 +++-
 common/keras_utils.py                         | 66 +++++++++++++++++--
 examples/ADRP/adrp_baseline_keras2.py         |  6 +-
 .../xform-smiles/smiles_class_transformer.py  | 10 ++-
 20 files changed, 166 insertions(+), 36 deletions(-)

diff --git a/Pilot1/Attn/attn_abstention_keras2.py b/Pilot1/Attn/attn_abstention_keras2.py
index 9783acd1..d6cc54ab 100644
--- a/Pilot1/Attn/attn_abstention_keras2.py
+++ b/Pilot1/Attn/attn_abstention_keras2.py
@@ -21,6 +21,7 @@ import attn_viz_utils as attnviz

 from attn_baseline_keras2 import build_attention_model
+from keras_utils import PerformanceReportCallback

 np.set_printoptions(precision=4)
 tf.compat.v1.disable_eager_execution()
@@ -292,6 +293,8 @@ def run(params):

     epochs = params['epochs']
     batch_size = params['batch_size']
+    perf_callback = PerformanceReportCallback(batch_size)
+    callbacks.append(perf_callback)
     history = model.fit(X_train, Y_train, class_weight=d_class_weights,
                         batch_size=batch_size, epochs=epochs,
diff --git a/Pilot1/Attn/attn_baseline_keras2.py b/Pilot1/Attn/attn_baseline_keras2.py
index 9bed2c39..7bfc4059 100644
--- a/Pilot1/Attn/attn_baseline_keras2.py
+++ b/Pilot1/Attn/attn_baseline_keras2.py
@@ -23,6 +23,7 @@

 import candle
 import attn_viz_utils as attnviz
+from keras_utils import PerformanceReportCallback

 np.set_printoptions(precision=4)
 tf.compat.v1.disable_eager_execution()
@@ -247,6 +248,8 @@ def run(params):

     epochs = params['epochs']
     batch_size = params['batch_size']
+    perf_callback = PerformanceReportCallback(batch_size)
+    callbacks.append(perf_callback)
     history = model.fit(X_train, Y_train, class_weight=d_class_weights,
                         batch_size=batch_size, epochs=epochs,
diff --git a/Pilot1/Attn/attn_bin_working_jan7_h5.py b/Pilot1/Attn/attn_bin_working_jan7_h5.py
index a3703468..922d76b9 100644
--- a/Pilot1/Attn/attn_bin_working_jan7_h5.py
+++ b/Pilot1/Attn/attn_bin_working_jan7_h5.py
@@ -26,11 +26,12 @@
 from sklearn.utils.class_weight import compute_class_weight
 from sklearn.metrics import roc_auc_score, auc, roc_curve, f1_score, precision_recall_curve
-
 file_path = os.path.dirname(os.path.realpath(__file__))
 lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path)
+import candle
+from keras_utils import PerformanceReportCallback

 psr = argparse.ArgumentParser(description='input agg csv file')
 psr.add_argument('--in', default='in_file')
 psr.add_argument('--ep', type=int, default=400)
@@ -220,13 +221,14 @@ def load_data():
 reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40,
                               verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001)
 early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto')
+perf_callback = PerformanceReportCallback(BATCH)

 # history = parallel_model.fit(X_train, Y_train,
 history = model.fit(X_train, Y_train, class_weight=d_class_weights,
                     batch_size=BATCH, epochs=EPOCH,
                     verbose=1,
                     validation_data=(X_val, Y_val),
-                    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop])
+                    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop, perf_callback])

 score = model.evaluate(X_test, Y_test, verbose=0)
diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py
index 5791adb4..17713fd9 100644
--- a/Pilot1/Combo/combo_baseline_keras2.py
+++ b/Pilot1/Combo/combo_baseline_keras2.py
@@ -24,12 +24,18 @@
 from sklearn.model_selection import StratifiedKFold, GroupKFold
 from scipy.stats.stats import pearsonr

+import tensorflow as tf
+options = tf.profiler.experimental.ProfilerOptions(host_tracer_level=3, python_tracer_level=1, device_tracer_level=1)
+tf.compat.v1.disable_eager_execution()
+
 import matplotlib as mpl
 mpl.use('Agg')

 import NCI60
 import combo
 import candle
+from keras_utils import PerformanceReportCallback
+
 logger = logging.getLogger(__name__)

 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
@@ -702,6 +708,8 @@ def warmup_scheduler(epoch):

     # callbacks = [history_logger, model_recorder]
     callbacks = [candle_monitor, timeout_monitor, history_logger, model_recorder]
+    perf_callback = PerformanceReportCallback(args.batch_size)
+    callbacks.append(perf_callback)
     if args.reduce_lr:
         callbacks.append(reduce_lr)
     if args.warmup_lr:
diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py
index 3eceba0a..152950d3 100644
--- a/Pilot1/NT3/nt3_baseline_keras2.py
+++ b/Pilot1/NT3/nt3_baseline_keras2.py
@@ -15,7 +15,7 @@

 import nt3 as bmk
 import candle
-
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='nt3_default_model.txt'):

@@ -193,7 +193,7 @@ def run(gParameters):
     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
     candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters)
     timeoutMonitor = candle.TerminateOnTimeOut(gParameters['timeout'])
-
+    perf_callback = PerformanceReportCallback(gParameters['batch_size'])
     history = model.fit(X_train, Y_train,
                         batch_size=gParameters['batch_size'],
                         epochs=gParameters['epochs'],
@@ -201,7 +201,7 @@
                         verbose=1,
                         validation_data=(X_test, Y_test),
                         callbacks=[csv_logger, reduce_lr, candleRemoteMonitor, timeoutMonitor,
-                                   ckpt])
+                                   ckpt, perf_callback])

     score = model.evaluate(X_test, Y_test, verbose=0)
diff --git a/Pilot1/P1B1/p1b1_baseline_keras2.py b/Pilot1/P1B1/p1b1_baseline_keras2.py
index d0892ada..dab295bf 100644
--- a/Pilot1/P1B1/p1b1_baseline_keras2.py
+++ b/Pilot1/P1B1/p1b1_baseline_keras2.py
@@ -24,7 +24,7 @@

 import p1b1
 import candle
-
+from keras_utils import PerformanceReportCallback

 np.set_printoptions(precision=4)

@@ -324,6 +324,8 @@ def warmup_scheduler(epoch):
         callbacks.append(checkpointer)
     if params['tb']:
         callbacks.append(tensorboard)
+    perf_callback = PerformanceReportCallback(params['batch_size'])
+    callbacks.append(perf_callback)

     x_val2 = np.copy(x_val)
     np.random.shuffle(x_val2)
diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py
index 4cb09757..bf2505bd 100644
--- a/Pilot1/P1B2/p1b2_baseline_keras2.py
+++ b/Pilot1/P1B2/p1b2_baseline_keras2.py
@@ -9,7 +9,7 @@

 import p1b2
 import candle
-
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='p1b2_default_model.txt'):

@@ -109,10 +109,12 @@ def run(gParameters):

     # Seed random generator for training
     np.random.seed(seed)

+    perf_callback = PerformanceReportCallback(gParameters['batch_size'])
     mlp.fit(X_train, y_train,
             batch_size=gParameters['batch_size'],
             epochs=gParameters['epochs'],
-            validation_data=(X_val, y_val)
+            validation_data=(X_val, y_val),
+            callbacks=[perf_callback]
             )

     # model save
diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py
index e1765053..f2faafc7 100644
--- a/Pilot1/P1B3/p1b3_baseline_keras2.py
+++ b/Pilot1/P1B3/p1b3_baseline_keras2.py
@@ -22,6 +22,7 @@
 import p1b3 as benchmark
 import candle
 import tensorflow as tf
+from keras_utils import PerformanceReportCallback

 tf.compat.v1.disable_eager_execution()

@@ -344,6 +345,7 @@ def run(gParameters):
     np.random.seed(seed)

     candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters)
+    perf_callback = PerformanceReportCallback(gParameters['batch_size'])

     # history = model.fit(train_gen, steps_per_epoch=train_steps,  # this should be the deprecation fix
     history = model.fit(train_gen, steps_per_epoch=train_steps,
@@ -351,7 +353,7 @@
                         validation_data=val_gen,
                         validation_steps=val_steps,
                         verbose=0,
-                        callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor],
+                        callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor, perf_callback],
                         )
     # callbacks=[checkpointer, loss_history, candleRemoteMonitor],  # this just caused the job to hang on Biowulf
diff --git a/Pilot1/TC1/tc1_baseline_keras2.py b/Pilot1/TC1/tc1_baseline_keras2.py
index 013af0a3..8d7cf357 100644
--- a/Pilot1/TC1/tc1_baseline_keras2.py
+++ b/Pilot1/TC1/tc1_baseline_keras2.py
@@ -17,6 +17,7 @@

 import tc1 as bmk
 import candle
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='tc1_default_model.txt'):

@@ -122,12 +123,13 @@ def run(gParameters):
     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                                   verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

+    perf_callback = PerformanceReportCallback(gParameters['batch_size'])
     history = model.fit(X_train, Y_train,
                         batch_size=gParameters['batch_size'],
                         epochs=gParameters['epochs'],
                         verbose=1,
                         validation_data=(X_test, Y_test),
-                        callbacks=[checkpointer, csv_logger, reduce_lr])
+                        callbacks=[checkpointer, csv_logger, reduce_lr, perf_callback])

     score = model.evaluate(X_test, Y_test, verbose=0)
diff --git a/Pilot2/P2B1/p2b1.py b/Pilot2/P2B1/p2b1.py
index e3311d19..8e551c16 100644
--- a/Pilot2/P2B1/p2b1.py
+++ b/Pilot2/P2B1/p2b1.py
@@ -334,7 +334,7 @@ def train_ac(self):
         # for frame in random.sample(range(len(xt_all)), int(self.sampling_density*len(xt_all))):
         for frame in range(len(xt_all)):
             history = self.molecular_model.fit(xt_all[frame], yt_all[frame], epochs=1,
-                                               batch_size=self.batch_size, callbacks=self.callbacks[:2])
+                                               batch_size=self.batch_size, callbacks=self.callbacks)
             frame_loss.append(history.history['loss'])
             frame_mse.append(history.history['mean_squared_error'])
diff --git a/Pilot2/P2B1/p2b1_baseline_keras2.py b/Pilot2/P2B1/p2b1_baseline_keras2.py
index c698fc74..534af317 100644
--- a/Pilot2/P2B1/p2b1_baseline_keras2.py
+++ b/Pilot2/P2B1/p2b1_baseline_keras2.py
@@ -23,6 +23,7 @@

 import candle
 import p2b1_AE_models as AE_models
+from keras_utils import PerformanceReportCallback

 HOME = os.environ['HOME']
@@ -249,7 +250,8 @@ def step_decay(epoch):
     history_logger = candle.LoggingCallback(logger.debug)
     candleRemoteMonitor = candle.CandleRemoteMonitor(params=GP)
     timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT)
-    callbacks = [history, history_logger, candleRemoteMonitor, timeoutMonitor]
+    perfCallback = PerformanceReportCallback(GP['batch_size'])
+    callbacks = [history, history_logger, candleRemoteMonitor, timeoutMonitor, perfCallback]

     # loss = 0.

     # ### Save the Model to disk
diff --git a/Pilot3/P3B1/p3b1_baseline_keras2.py b/Pilot3/P3B1/p3b1_baseline_keras2.py
index ed1e316c..98338ba0 100644
--- a/Pilot3/P3B1/p3b1_baseline_keras2.py
+++ b/Pilot3/P3B1/p3b1_baseline_keras2.py
@@ -11,7 +11,7 @@

 import p3b1 as bmk
 import candle
-
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='p3b1_default_model.txt'):

@@ -160,10 +160,11 @@ def train_model(gParameters, models,
             gParameters['run_id'] = base_run_id + ".{}.{}.{}".format(fold, epoch, k)
             candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters)
             timeoutMonitor = candle.TerminateOnTimeOut(gParameters['timeout'])
+            perf_callback = PerformanceReportCallback(gParameters['batch_size'])

             model.fit({'input': X_train[k]}, {'out_' + str(k): Y_train[k]},
                       epochs=1, verbose=verbose,
-                      callbacks=[candleRemoteMonitor, timeoutMonitor],
+                      callbacks=[candleRemoteMonitor, timeoutMonitor, perf_callback],
                       batch_size=gParameters['batch_size'],
                       validation_data=(X_test[k], Y_test[k]))
diff --git a/Pilot3/P3B2/p3b2_baseline_keras2.py b/Pilot3/P3B2/p3b2_baseline_keras2.py
index 2414d1b4..48328b5c 100644
--- a/Pilot3/P3B2/p3b2_baseline_keras2.py
+++ b/Pilot3/P3B2/p3b2_baseline_keras2.py
@@ -1,3 +1,4 @@
+import tensorflow as tf
 import tensorflow.keras as keras
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Activation
@@ -11,7 +12,7 @@

 import p3b2 as bmk
 import candle
-
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='p3b2_default_model.txt'):

@@ -126,13 +127,33 @@ def run(gParameters):
             ret_seq = True
         else:
             ret_seq = False
-
-        if k == 0:
-            model.add(LSTM(rnn_size, input_shape=(maxlen, len(chars)), return_sequences=ret_seq,
-                           dropout=dropout, recurrent_dropout=recurrent_dropout))
+        if gParameters['itexlstm']:
+            try:
+                import intel_extension_for_tensorflow as itex
+                print('Using ITEX-LSTM')
+                if k == 0:
+                    model.add(itex.ops.ItexLSTM(rnn_size, input_shape=(maxlen, len(chars)), return_sequences=ret_seq,
+                                                dropout=dropout, recurrent_dropout=recurrent_dropout))
+                else:
+                    model.add(itex.ops.ItexLSTM(rnn_size, dropout=dropout, recurrent_dropout=recurrent_dropout,
+                                                return_sequences=ret_seq))
+            except ImportError:
+                from tensorflow.keras.layers import LSTM
+                print('Using Keras-LSTM')
+                if k == 0:
+                    model.add(LSTM(rnn_size, input_shape=(maxlen, len(chars)), return_sequences=ret_seq,
+                                   dropout=dropout, recurrent_dropout=recurrent_dropout))
+                else:
+                    model.add(LSTM(rnn_size, dropout=dropout, recurrent_dropout=recurrent_dropout,
+                                   return_sequences=ret_seq))
         else:
-            model.add(LSTM(rnn_size, dropout=dropout, recurrent_dropout=recurrent_dropout,
-                           return_sequences=ret_seq))
+            from tensorflow.keras.layers import LSTM
+            if k == 0:
+                model.add(LSTM(rnn_size, input_shape=(maxlen, len(chars)), return_sequences=ret_seq,
+                               dropout=dropout, recurrent_dropout=recurrent_dropout))
+            else:
+                model.add(LSTM(rnn_size, dropout=dropout, recurrent_dropout=recurrent_dropout,
+                               return_sequences=ret_seq))
     model.add(Dense(len(chars)))
     model.add(Activation(gParameters['activation']))
@@ -153,7 +174,8 @@ def run(gParameters):
         print('Iteration', iteration)

         history = LossHistory()
-        model.fit(X, y, batch_size=100, epochs=1, callbacks=[history])
+        perf_callback = PerformanceReportCallback(gParameters['batch_size'])
+        model.fit(X, y, batch_size=gParameters['batch_size'], epochs=gParameters['epochs'], callbacks=[history, perf_callback])

         loss = history.losses[-1]
         if verbose:
diff --git a/Pilot3/P3B2/p3b2_default_model.txt b/Pilot3/P3B2/p3b2_default_model.txt
index 51089215..937711bc 100644
--- a/Pilot3/P3B2/p3b2_default_model.txt
+++ b/Pilot3/P3B2/p3b2_default_model.txt
@@ -2,6 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P3B2/'
 train_data = 'P3B2_data.tgz'
 model_name = 'p3b2'
+batch_size = 100
 rnn_size = 64
 epochs = 2
 n_layers = 1
@@ -16,3 +17,4 @@ primetext = 'Diagnosis'
 length = 1000
 do_sample = True
 verbose = True
+itexlstm = True
diff --git a/Pilot3/P3B3/p3b3_baseline_keras2.py b/Pilot3/P3B3/p3b3_baseline_keras2.py
index e03ab229..7fd52e66 100644
--- a/Pilot3/P3B3/p3b3_baseline_keras2.py
+++ b/Pilot3/P3B3/p3b3_baseline_keras2.py
@@ -17,7 +17,7 @@
 import p3b3 as bmk
 import keras_mt_shared_cnn
 import candle
-
+from keras_utils import PerformanceReportCallback

 def initialize_parameters(default_model='p3b3_default_model.txt'):

@@ -105,7 +105,7 @@ def run_cnn(GP, train_x, train_y, test_x, test_y,

     candleRemoteMonitor = candle.CandleRemoteMonitor(params=GP)
     timeoutMonitor = candle.TerminateOnTimeOut(GP['timeout'])
-
+    perf_callback = PerformanceReportCallback(batch_size)
     history = cnn.fit(
         x=np.array(train_x),
         y=train_labels,
@@ -113,7 +113,7 @@
         epochs=epochs,
         verbose=2,
         validation_data=validation_data,
-        callbacks=[candleRemoteMonitor, timeoutMonitor]
+        callbacks=[candleRemoteMonitor, timeoutMonitor, perf_callback]
     )

     return history
diff --git a/Pilot3/P3B4/dense_attention.py b/Pilot3/P3B4/dense_attention.py
index 464e0df0..a0fef8dd 100644
--- a/Pilot3/P3B4/dense_attention.py
+++ b/Pilot3/P3B4/dense_attention.py
@@ -22,6 +22,7 @@
 from __future__ import division
 from __future__ import print_function

+import tensorflow as tf
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
@@ -172,7 +173,11 @@ def compute_mask(self, inputs, mask=None):
             q_mask = mask[0]
             if q_mask is None:
                 return None
-            return ops.convert_to_tensor_v2(q_mask)
+            if tuple(int(v) for v in tf.__version__.split('.')[:2]) < (2, 13):
+                return ops.convert_to_tensor_v2(q_mask)
+            else:
+                from tensorflow.python.framework import tensor_conversion
+                return tensor_conversion.convert_to_tensor_v2(q_mask)
         return None

     def _validate_call_args(self, inputs, mask):
diff --git a/Pilot3/P3B4/tf2_mthisan.py b/Pilot3/P3B4/tf2_mthisan.py
index eae03ea3..8e945090 100644
--- a/Pilot3/P3B4/tf2_mthisan.py
+++ b/Pilot3/P3B4/tf2_mthisan.py
@@ -235,9 +235,11 @@ def train(self, data, labels, batch_size=128, epochs=100, patience=5,
             y_trues = [[] for c in self.num_classes]
             start_time = time.time()

+            epoch_time = 0
+            epoch_batch_count = 0
+
             # train
             for start in range(0, len(data), batch_size):
-
                 # get batch index
                 if start + batch_size < len(data):
                     stop = start + batch_size
@@ -245,8 +247,13 @@
                     stop = len(data)

                 # train step
+                batch_begin_time = time.time()
                 predictions, loss = self._train_step(data[start:stop],
                                                      np.array([lIndex[start:stop] for lIndex in labels]))
+                batch_time = time.time() - batch_begin_time
+                epoch_time += batch_time
+                epoch_batch_count += 1
+                batch_speed = batch_size / batch_time  # per-batch throughput, useful when debugging

                 # track correct predictions
                 for i, (p, lIndex) in enumerate(zip(predictions, [lIndex[start:stop] for lIndex in labels])):
@@ -258,7 +265,8 @@

             # checkpoint after every epoch
             print("\ntraining time: %.2f" % (time.time() - start_time))
-
+            epoch_speed = (batch_size * epoch_batch_count) / epoch_time
+            print(f"\r\nepoch time (s): {round(epoch_time, 3)} throughput (samples/sec): {round(epoch_speed, 3)}", flush=True)
             for i in range(self.num_tasks):
                 micro = f1_score(y_trues[i], y_preds[i], average='micro')
                 macro = f1_score(y_trues[i], y_preds[i], average='macro')
diff --git a/common/keras_utils.py b/common/keras_utils.py
index fb251882..a246f967 100644
--- a/common/keras_utils.py
+++ b/common/keras_utils.py
@@ -5,6 +5,7 @@

 from tensorflow.keras import optimizers
 from tensorflow.keras import initializers
+import tensorflow as tf

 from tensorflow.keras.layers import Dropout
 from tensorflow.keras.callbacks import Callback
 from tensorflow.keras.utils import get_custom_objects
@@ -20,6 +21,7 @@
 from sklearn.metrics import r2_score

 import os
+import time


 def set_parallelism_threads():
@@ -89,35 +91,35 @@ def build_optimizer(optimizer, lr, kerasDefaults):
     """

     if optimizer == 'sgd':
-        return optimizers.SGD(lr=lr, decay=kerasDefaults['decay_lr'],
+        return tf.keras.optimizers.legacy.SGD(lr=lr, decay=kerasDefaults['decay_lr'],
                               momentum=kerasDefaults['momentum_sgd'],
                               nesterov=kerasDefaults['nesterov_sgd'])  # ,
 #                             clipnorm=kerasDefaults['clipnorm'],
 #                             clipvalue=kerasDefaults['clipvalue'])

     elif optimizer == 'rmsprop':
-        return optimizers.RMSprop(lr=lr, rho=kerasDefaults['rho'],
+        return tf.keras.optimizers.legacy.RMSprop(lr=lr, rho=kerasDefaults['rho'],
                                   epsilon=kerasDefaults['epsilon'],
                                   decay=kerasDefaults['decay_lr'])  # ,
 #                                 clipnorm=kerasDefaults['clipnorm'],
 #                                 clipvalue=kerasDefaults['clipvalue'])

     elif optimizer == 'adagrad':
-        return optimizers.Adagrad(lr=lr,
+        return tf.keras.optimizers.legacy.Adagrad(lr=lr,
                                   epsilon=kerasDefaults['epsilon'],
                                   decay=kerasDefaults['decay_lr'])  # ,
 #                                 clipnorm=kerasDefaults['clipnorm'],
 #                                 clipvalue=kerasDefaults['clipvalue'])

     elif optimizer == 'adadelta':
-        return optimizers.Adadelta(lr=lr, rho=kerasDefaults['rho'],
+        return tf.keras.optimizers.legacy.Adadelta(lr=lr, rho=kerasDefaults['rho'],
                                    epsilon=kerasDefaults['epsilon'],
                                    decay=kerasDefaults['decay_lr'])  # ,
 #                                  clipnorm=kerasDefaults['clipnorm'],
 #                                  clipvalue=kerasDefaults['clipvalue'])

     elif optimizer == 'adam':
-        return optimizers.Adam(lr=lr, beta_1=kerasDefaults['beta_1'],
+        return tf.keras.optimizers.legacy.Adam(lr=lr, beta_1=kerasDefaults['beta_1'],
                                beta_2=kerasDefaults['beta_2'],
                                epsilon=kerasDefaults['epsilon'],
                                decay=kerasDefaults['decay_lr'])  # ,
@@ -255,3 +257,57 @@ def __init__(self, print_fcn=print):

     def on_epoch_end(self, epoch, logs={}):
         msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())))
         self.print_fcn(msg)
+
+
+class PerformanceReportCallback(Callback):
+    def __init__(self, batch_size):
+        super().__init__()
+        self.batchsize = batch_size
+        self.logfreq = 0
+        self.batch_begin_time = 0
+        self.batch_end_time = 0
+        self.max_speed = 0
+        self.epoch_time = 0
+        self.train_time = 0
+        self.epoch_count = 0
+
+    def on_batch_begin(self, batch, logs=None):
+        self.batch_begin_time = time.time()
+
+    def on_batch_end(self, batch, logs=None):
+        if batch == 0 and self.epoch_count == 0:  # skip the first batch of training as warmup
+            return
+        self.batch_time = time.time() - self.batch_begin_time
+        self.epoch_time += self.batch_time
+        self.epoch_batch_count += 1
+        self.train_batch_count += 1
+
+        self.batch_speed = self.batchsize / self.batch_time
+        if self.batch_speed > self.max_speed:
+            self.max_speed = self.batch_speed
+
+        if self.logfreq != 0 and batch % self.logfreq == 0:
+            print(f"\r\nbatch {batch} time(s): {round(self.batch_time, 6)} throughput (samples/sec): {round(self.batch_speed, 3)}", flush=True)
+
+    def on_epoch_begin(self, epoch, logs=None):
+        self.epoch_batch_count = 0
+        self.epoch_time = 0
+        self.epoch_begin_time = time.time()
+
+    def on_epoch_end(self, epoch, logs=None):
+        self.train_time += self.epoch_time
+        self.epoch_count += 1
+
+        if self.epoch_time != 0:
+            self.epoch_avg_speed = self.epoch_batch_count * self.batchsize / self.epoch_time
+            print(f"\r\nepoch {epoch} time(s): {round(self.epoch_time, 3)} throughput (samples/sec): {round(self.epoch_avg_speed, 3)}", flush=True)
+
+    def on_train_begin(self, logs=None):
+        self.train_batch_count = 0
+        self.train_time = 0
+        self.train_begin_time = time.time()
+
+    def on_train_end(self, logs=None):
+        if self.train_time != 0:
+            speed_train = (self.batchsize * self.train_batch_count) / self.train_time
+            print(f"\r\nTotal train time(s): {round(self.train_time, 3)} batches: {self.train_batch_count} batchsize: {self.batchsize} throughput (samples/sec) (avg, max): {round(speed_train, 3)}, {round(self.max_speed, 3)}", flush=True)
diff --git a/examples/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py
index 531fda00..0741f0cd 100644
--- a/examples/ADRP/adrp_baseline_keras2.py
+++ b/examples/ADRP/adrp_baseline_keras2.py
@@ -6,8 +6,8 @@

 import matplotlib.pyplot as plt
 import h5py
-import tensorflow as tf
+
+import tensorflow as tf
 from tensorflow.keras import backend as K
 from tensorflow.keras.layers import Input, Dense, Dropout
 from tensorflow.keras.models import Model, model_from_json, model_from_yaml
@@ -29,6 +29,7 @@
 import sys
 import adrp
 import candle
+from keras_utils import PerformanceReportCallback

 np.set_printoptions(precision=4)

@@ -376,6 +377,7 @@ def run(params):
     # history = parallel_model.fit(X_train, Y_train,
     epochs = params["epochs"]
     batch_size = params["batch_size"]
+    perf_callback = PerformanceReportCallback(batch_size)
     timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
     if (params['use_sample_weight']):
         if (params['sample_weight_type'] == 'linear'):
@@ -431,7 +433,7 @@
         verbose=1,
         sample_weight=train_weight,
         validation_data=(X_test, Y_test, test_weight),
-        callbacks=[checkpointer, timeout_monitor, csv_logger, reduce_lr, early_stop],
+        callbacks=[checkpointer, timeout_monitor, csv_logger, reduce_lr, early_stop, perf_callback],
     )

     print("Reloading saved best model")
diff --git a/examples/xform-smiles/smiles_class_transformer.py b/examples/xform-smiles/smiles_class_transformer.py
index afe9b61c..97caadc1 100644
--- a/examples/xform-smiles/smiles_class_transformer.py
+++ b/examples/xform-smiles/smiles_class_transformer.py
@@ -23,6 +23,13 @@
 from tensorflow.keras.preprocessing import sequence
 from tensorflow.keras.preprocessing import text

+file_path = os.path.dirname(os.path.realpath(__file__))
+lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common"))
+sys.path.append(lib_path2)
+
+import candle
+from keras_utils import PerformanceReportCallback
+
 file_path = os.path.dirname(os.path.realpath(__file__))
 lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path)
@@ -168,12 +175,13 @@ def prep_text(texts, tokenizer, max_sequence_length):

 csv_logger = CSVLogger('smile_class.training.log')
 reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=20,
                               verbose=1, mode='auto', epsilon=0.0001, cooldown=3, min_lr=0.000000001)
 early_stop = EarlyStopping(monitor='val_loss', patience=100, verbose=1, mode='auto')
+perf_callback = PerformanceReportCallback(BATCH)

 history = model.fit(x_train, y_train,
                     batch_size=BATCH,
                     epochs=EPOCH,
                     verbose=1,
                     validation_data=(x_val, y_val),
-                    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop])
+                    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop, perf_callback])

 model.load_weights('smile_class.autosave.model.h5')
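
Usage note: the sketch below shows how a benchmark outside this changeset might wire in the new callback. It is illustrative only; the model and data are hypothetical placeholders, and the only pieces taken from this patch are the PerformanceReportCallback(batch_size) constructor and the reports it prints from common/keras_utils.py.

    import numpy as np
    import tensorflow as tf

    # common/ must be on sys.path, as the benchmarks above arrange
    from keras_utils import PerformanceReportCallback

    batch_size = 32
    x = np.random.rand(1024, 16).astype(np.float32)  # placeholder data
    y = np.random.rand(1024, 1).astype(np.float32)

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(16,)),
        tf.keras.layers.Dense(1),
    ])
    model.compile(optimizer='sgd', loss='mse')

    # Per-epoch time and throughput (samples/sec) are printed after each
    # epoch; an overall summary with average and peak throughput is printed
    # when fit() returns. The first batch of training is excluded as warmup.
    perf_callback = PerformanceReportCallback(batch_size)
    model.fit(x, y, batch_size=batch_size, epochs=3, callbacks=[perf_callback])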