From 8db245a5eee7f30507e22d3ac21fe0b40f4f1d8a Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 15:27:51 +0000
Subject: [PATCH 01/11] Multiple checkpoint save

---
 parlai/scripts/train_model.py | 48 ++++++++++++++++++++++++++++++++---
 parlai/utils/misc.py          | 10 ++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 2284985ab63..b3ce0a4ec3e 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -58,7 +58,7 @@
     num_workers,
 )
 from parlai.utils.io import PathManager
-from parlai.utils.misc import Timer, nice_report
+from parlai.utils.misc import Timer, nice_report, ordinal
 from parlai.utils.world_logging import WorldLogger
 
 
@@ -134,6 +134,13 @@ def setup_args(parser=None) -> ParlaiParser:
         default=-1,
         help='End training after n model updates',
     )
+    train.add_argument(
+        '-topk',
+        '--save-top-k-checkpoints',
+        type=int,
+        default=1,
+        help='Save and keep the k checkpoints with the best validation metric',
+    )
     train.add_argument('-ltim', '--log-every-n-secs', type=float, default=-1)
     train.add_argument(
         '-lstep',
@@ -409,6 +416,7 @@ def __init__(self, opt):
         self.save_every_n_secs = _num_else_inf(
             opt, 'save_every_n_secs', distributed_warn=True
         )
+        self.save_top_k = _num_else_inf(opt, 'save_top_k_checkpoints')
 
         # smart defaults for --validation-metric-mode
         if opt['validation_metric'] in {'loss', 'ppl', 'mean_rank'}:
@@ -427,6 +435,7 @@ def __init__(self, opt):
         self.final_test_report = {}
         self.final_extra_valid_report = {}
         self.best_valid = None
+        self.best_k_models = []  # every element is a (checkpoint path, validation metric) tuple
 
         self.impatience = 0
         self.saved = False
@@ -453,6 +462,7 @@ def __init__(self, opt):
                 'total_epochs', 0.0
             )
             self.train_reports = obj.get('train_reports', [])
+            self.best_k_models = obj.get('best_k_models', [])
             if 'best_valid' in obj:
                 self.best_valid = obj['best_valid']
             else:
@@ -522,6 +532,7 @@ def _save_train_stats(self, suffix=None):
             'train_reports': self.train_reports,
             'valid_reports': self.valid_reports,
             'best_valid': self.best_valid,
+            'best_k_models': self.best_k_models,
             'impatience': self.impatience,
             'final_valid_report': dict_report(self.final_valid_report),
             'final_test_report': dict_report(self.final_test_report),
@@ -579,8 +590,10 @@ def validate(self):
 
         # check if this is the best validation so far
         if (
-            self.best_valid is None
-            or self.valid_optim * new_valid > self.valid_optim * self.best_valid
+            (
+                self.best_valid is None
+                or self.valid_optim * new_valid > self.valid_optim * self.best_valid
+            ) and self.save_top_k == 1
        ):
             logging.success(
                 'new best {}: {:.4g}{}'.format(
@@ -606,6 +619,23 @@ def validate(self):
             ):
                 logging.info('task solved! stopping.')
                 return True
+        elif (
+            self.save_top_k > 1
+            and self.opt.get('model_file')
+            and (
+                len(self.best_k_models) < self.save_top_k
+                or self.valid_optim * new_valid > self.valid_optim * self.best_k_models[-1][1]
+                # if new validation metric is better than kth saved model metric
+            )
+        ):
+            model_rank = sum(
+                new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models
+
+            model_suffix = '_'+ordinal(model_rank)+'.'+self._train_steps
+            self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))
+            self.save_model(model_suffix) # Save model as "model_nth."
+            self._modify_next_rank_checkpoints(model_rank)
+
         else:
             self.impatience += 1
             logging.report(
@@ -628,6 +658,18 @@ def validate(self):
                 logging.info('ran out of patience! stopping training.')
                 return True
         return False
+
+    def _modify_next_rank_checkpoints(self, model_rank):
+        if len(self.best_k_models) >= self.save_top_k:
+            # remove kth best model from disk and best_k_models list to make space for new model
+            os.remove(self.best_k_models[-1][0])
+            del self.best_k_models[-1]
+        for ind in range(model_rank+1, len(self.best_k_models)):
+            prev_model_path = self.best_k_models[-1][0]
+            model_train_steps = prev_model_path.split('.')[-1]
+            new_model_path = self.opt['model_file'] + '_' + ordinal(ind-1) + '.' + model_train_steps
+            os.rename(self.best_k_models[-1][0], new_model_path)
+
 
     def _run_single_eval(self, opt, valid_world, max_exs, datatype, is_multitask, task):
diff --git a/parlai/utils/misc.py b/parlai/utils/misc.py
index 66435c69693..6a2f294cffa 100644
--- a/parlai/utils/misc.py
+++ b/parlai/utils/misc.py
@@ -323,6 +323,16 @@ def _report_sort_key(report_key: str) -> Tuple[str, str]:
     sub_key = '/'.join(fields)
     return (sub_key or 'all', main_key)
 
+def ordinal(n: int) -> str:
+    """
+    Convert a number to its ordinal string, e.g. 1 -> '1st'.
+    """
+    if 11 <= (n % 100) <= 13:
+        suffix = 'th'
+    else:
+        suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
+    return str(n) + suffix
+
 
 def float_formatter(f: Union[float, int]) -> str:
     """

From 8edc746286112260fe74bd0e8fa37fb2e806c192 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 20:27:20 +0000
Subject: [PATCH 02/11] Fixing duplicate topk opt

---
 parlai/scripts/train_model.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index b3ce0a4ec3e..4cf178a5b8c 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -61,6 +61,10 @@
 from parlai.utils.misc import Timer, nice_report, ordinal
 from parlai.utils.world_logging import WorldLogger
 
+import debugpy
+
+debugpy.listen(5678)
+debugpy.wait_for_client()
 
 def _num_else_inf(opt: Opt, key: str, distributed_warn=False):
     if opt[key] > 0:
@@ -135,7 +139,7 @@ def setup_args(parser=None) -> ParlaiParser:
         help='End training after n model updates',
     )
     train.add_argument(
-        '-topk',
+        '-stopk',
         '--save-top-k-checkpoints',
         type=int,
         default=1,
@@ -628,10 +632,8 @@ def validate(self):
                 # if new validation metric is better than kth saved model metric
             )
         ):
-            model_rank = sum(
-                new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models
-
-            model_suffix = '_'+ordinal(model_rank)+'.'+self._train_steps
+            model_rank = sum(new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models)
+            model_suffix = '_'+ordinal(model_rank)+'.'+str(self._train_steps)
             self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))
             self.save_model(model_suffix) # Save model as "model_nth."
             self._modify_next_rank_checkpoints(model_rank)

From d40adefec76630846b7e836d7ccab93c853456b7 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 20:28:53 +0000
Subject: [PATCH 03/11] Fixing missing str() cast of _train_steps in model_suffix

---
 parlai/scripts/train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 4cf178a5b8c..79847857713 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -633,7 +633,7 @@ def validate(self):
             )
         ):
             model_rank = sum(new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models)
-            model_suffix = '_'+ordinal(model_rank)+'.'+str(self._train_steps)
+            model_suffix = '_' + ordinal(model_rank) + '.' + str(self._train_steps)
             self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))
             self.save_model(model_suffix) # Save model as "model_nth."
             self._modify_next_rank_checkpoints(model_rank)

From 4a066d6a77ea6a4adc1e1fee2015c85fcbf4c36e Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 21:44:45 +0000
Subject: [PATCH 04/11] Resetting impatience in save_top_k mode

---
 parlai/scripts/train_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 79847857713..873f3f04c7e 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -632,6 +632,7 @@ def validate(self):
                 # if new validation metric is better than kth saved model metric
             )
         ):
+            self.impatience = 0
             model_rank = sum(new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models)
             model_suffix = '_' + ordinal(model_rank) + '.' + str(self._train_steps)
             self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))

From c7e1dc0bf71a27a17cd21adc77ee36fc6c0fc1c1 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 22:02:16 +0000
Subject: [PATCH 05/11] Fixing bug in suffix ordinal

---
 parlai/scripts/train_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 873f3f04c7e..e4e94e6399a 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -634,7 +634,7 @@ def validate(self):
         ):
             self.impatience = 0
             model_rank = sum(new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models)
-            model_suffix = '_' + ordinal(model_rank) + '.' + str(self._train_steps)
+            model_suffix = '_' + ordinal(model_rank+1) + '.' + str(self._train_steps)
             self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))
             self.save_model(model_suffix) # Save model as "model_nth."
             self._modify_next_rank_checkpoints(model_rank)
@@ -670,7 +670,7 @@ def _modify_next_rank_checkpoints(self, model_rank):
         for ind in range(model_rank+1, len(self.best_k_models)):
             prev_model_path = self.best_k_models[-1][0]
             model_train_steps = prev_model_path.split('.')[-1]
-            new_model_path = self.opt['model_file'] + '_' + ordinal(ind-1) + '.' + model_train_steps
+            new_model_path = self.opt['model_file'] + '_' + ordinal(ind) + '.' + model_train_steps
             os.rename(self.best_k_models[-1][0], new_model_path)

From 373d0a1bc657bd90c2ccd89f21d941933f42ab65 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Wed, 8 Mar 2023 22:09:51 +0000
Subject: [PATCH 06/11] Fixing bug in _modify_next_rank_checkpoints ordinal

---
 parlai/scripts/train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index e4e94e6399a..60ca76128bb 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -670,7 +670,7 @@ def _modify_next_rank_checkpoints(self, model_rank):
         for ind in range(model_rank+1, len(self.best_k_models)):
             prev_model_path = self.best_k_models[-1][0]
             model_train_steps = prev_model_path.split('.')[-1]
-            new_model_path = self.opt['model_file'] + '_' + ordinal(ind) + '.' + model_train_steps
+            new_model_path = self.opt['model_file'] + '_' + ordinal(ind+1) + '.' + model_train_steps
             os.rename(self.best_k_models[-1][0], new_model_path)

From a5a2ab510730bdd2d55af9bc90d621a0e6d6519a Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Thu, 9 Mar 2023 12:46:24 +0000
Subject: [PATCH 07/11] Fixing bug in indexing checkpoints after the new checkpoint

---
 parlai/scripts/train_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 60ca76128bb..594d7e2559f 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -668,10 +668,10 @@ def _modify_next_rank_checkpoints(self, model_rank):
             os.remove(self.best_k_models[-1][0])
             del self.best_k_models[-1]
         for ind in range(model_rank+1, len(self.best_k_models)):
-            prev_model_path = self.best_k_models[-1][0]
+            prev_model_path = self.best_k_models[ind][0]
             model_train_steps = prev_model_path.split('.')[-1]
             new_model_path = self.opt['model_file'] + '_' + ordinal(ind+1) + '.' + model_train_steps
-            os.rename(self.best_k_models[-1][0], new_model_path)
+            os.rename(self.best_k_models[ind][0], new_model_path)

From e01b6de0c3b30c08ef9f1fae6683aef6d159b0b6 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Fri, 10 Mar 2023 10:03:25 +0000
Subject: [PATCH 08/11] Using pathlib to remove redundant files in multiple-checkpoint mode

---
 parlai/scripts/train_model.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index 594d7e2559f..be72d25a1e3 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -28,6 +28,7 @@
 import torch
 import json
 import os
+from pathlib import Path
 import signal
 from typing import Tuple
 
@@ -635,7 +636,7 @@ def validate(self):
             self.impatience = 0
             model_rank = sum(new_valid < saved_model_prop[1] for saved_model_prop in self.best_k_models)
             model_suffix = '_' + ordinal(model_rank+1) + '.' + str(self._train_steps)
-            self.best_k_models.insert(model_rank, (self.opt['model_file']+model_suffix, new_valid))
+            self.best_k_models.insert(model_rank, [self.opt['model_file']+model_suffix, new_valid])
             self.save_model(model_suffix) # Save model as "model_nth."
             self._modify_next_rank_checkpoints(model_rank)
 
@@ -662,16 +663,22 @@ def validate(self):
                 return True
         return False
+
+
 
     def _modify_next_rank_checkpoints(self, model_rank):
-        if len(self.best_k_models) >= self.save_top_k:
-            # remove kth best model from disk and best_k_models list to make space for new model
-            os.remove(self.best_k_models[-1][0])
+        if len(self.best_k_models) > self.save_top_k:
+            # remove last best model and its files from disk and best_k_models list to make space for new model
+            last_path = Path(self.best_k_models[-1][0])
+            for file in last_path.parent.glob(last_path.name + '*'):
+                file.unlink()
             del self.best_k_models[-1]
         for ind in range(model_rank+1, len(self.best_k_models)):
-            prev_model_path = self.best_k_models[ind][0]
-            model_train_steps = prev_model_path.split('.')[-1]
-            new_model_path = self.opt['model_file'] + '_' + ordinal(ind+1) + '.' + model_train_steps
-            os.rename(self.best_k_models[ind][0], new_model_path)
+            prev_model_path = Path(self.best_k_models[ind][0])
+            model_train_steps = prev_model_path.suffix[1:]
+            new_model_path = Path(self.opt['model_file'] + '_' + ordinal(ind+1) + '.' + model_train_steps)
+            for file in prev_model_path.parent.glob(prev_model_path.name + '*'):
+                file.rename(str(new_model_path) + ''.join(file.suffixes[1:]))
+            self.best_k_models[ind][0] = str(new_model_path)
 
 
     def _run_single_eval(self, opt, valid_world, max_exs, datatype, is_multitask, task):

From dac1c81d0ff3348e334bf4374ee23c99e64bb00f Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Fri, 10 Mar 2023 10:53:43 +0000
Subject: [PATCH 09/11] Logging for running out of patience in multiple-checkpoint mode

---
 parlai/scripts/train_model.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index be72d25a1e3..e999ba00e69 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -639,14 +639,21 @@ def validate(self):
             self.best_k_models.insert(model_rank, [self.opt['model_file']+model_suffix, new_valid])
             self.save_model(model_suffix) # Save model as "model_nth."
             self._modify_next_rank_checkpoints(model_rank)
-
+
         else:
             self.impatience += 1
-            logging.report(
-                'did not beat best {}: {} impatience: {}'.format(
-                    opt['validation_metric'], round(self.best_valid, 4), self.impatience
+            if self.save_top_k == 1:
+                logging.report(
+                    'did not beat best {}: {} impatience: {}'.format(
+                        opt['validation_metric'], round(self.best_valid, 4), self.impatience
+                    )
+                )
+            else:
+                logging.report(
+                    'did not beat {} model\'s {}: {} impatience: {}'.format(
+                        ordinal(self.save_top_k), opt['validation_metric'], round(self.best_k_models[-1][1], 4), self.impatience
+                    )
                 )
-            )
         self.validate_time.reset()
 
         # saving

From 3918b8f92b0cee8f64f6c768234b5b34e5de2a2c Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Fri, 10 Mar 2023 10:57:36 +0000
Subject: [PATCH 10/11] Storing save state in multiple-checkpoint mode

---
 parlai/scripts/train_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index e999ba00e69..b322b20500e 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -638,6 +638,7 @@ def validate(self):
             model_suffix = '_' + ordinal(model_rank+1) + '.' + str(self._train_steps)
             self.best_k_models.insert(model_rank, [self.opt['model_file']+model_suffix, new_valid])
             self.save_model(model_suffix) # Save model as "model_nth."
+            self.saved = True
             self._modify_next_rank_checkpoints(model_rank)
 

From a0da2f015b232c0ce0816661921f19d85c09bb84 Mon Sep 17 00:00:00 2001
From: Hamed Jamshidian
Date: Fri, 10 Mar 2023 11:00:45 +0000
Subject: [PATCH 11/11] Stop training in multiple-checkpoint mode if validation metric passes cutoff

---
 parlai/scripts/train_model.py | 14 +++++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/parlai/scripts/train_model.py b/parlai/scripts/train_model.py
index b322b20500e..07bab0a9dab 100644
--- a/parlai/scripts/train_model.py
+++ b/parlai/scripts/train_model.py
@@ -62,10 +62,6 @@
 from parlai.utils.misc import Timer, nice_report, ordinal
 from parlai.utils.world_logging import WorldLogger
 
-import debugpy
-
-debugpy.listen(5678)
-debugpy.wait_for_client()
 
 def _num_else_inf(opt: Opt, key: str, distributed_warn=False):
     if opt[key] > 0:
@@ -640,7 +636,15 @@ def validate(self):
             self.save_model(model_suffix) # Save model as "model_nth."
             self.saved = True
             self._modify_next_rank_checkpoints(model_rank)
-
+            if (
+                opt['validation_metric_mode'] == 'max'
+                and self.best_k_models[-1][1] >= opt['validation_cutoff']
+            ) or (
+                opt['validation_metric_mode'] == 'min'
+                and self.best_k_models[-1][1] <= opt['validation_cutoff']
+            ):
+                logging.info('task solved! stopping.')
+                return True
         else:
             self.impatience += 1
             if self.save_top_k == 1:
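---

The eleven diffs converge on one scheme: checkpoints are named
<model_file>_<rank ordinal>.<train_steps>, a new validation result is inserted
into best_k_models by rank, every kept checkpoint behind it is renamed one
ordinal down, and the checkpoint that falls out of the top k is deleted. As a
reading aid, here is a minimal, self-contained Python sketch of that
bookkeeping. TopKTracker is a hypothetical stand-in, not ParlAI API: it prints
the save/delete/rename operations instead of touching disk, and, unlike the
patches (which apply valid_optim only in the gating comparison), it also uses
valid_optim when computing the rank, so minimized metrics such as loss or ppl
rank correctly.

    # Hypothetical invocation of the flag the series adds:
    #   parlai train_model ... --model-file /tmp/model --save-top-k-checkpoints 3

    def ordinal(n: int) -> str:
        """Convert an integer to its ordinal string, e.g. 1 -> '1st', 12 -> '12th'."""
        if 11 <= (n % 100) <= 13:
            suffix = 'th'
        else:
            suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
        return str(n) + suffix


    class TopKTracker:
        def __init__(self, model_file: str, k: int, maximize: bool = True):
            self.model_file = model_file
            self.k = k
            # +1 when bigger is better (e.g. accuracy), -1 when smaller is better (e.g. ppl)
            self.valid_optim = 1 if maximize else -1
            self.best_k_models = []  # [path, metric] pairs, best checkpoint first

        def _path(self, rank, steps):
            return '{}_{}.{}'.format(self.model_file, ordinal(rank), steps)

        def report(self, metric, steps):
            if len(self.best_k_models) >= self.k and not (
                self.valid_optim * metric > self.valid_optim * self.best_k_models[-1][1]
            ):
                print(f'step {steps}: {metric} did not make the top {self.k}')
                return
            # rank = number of already-saved checkpoints that are strictly better
            rank = sum(
                self.valid_optim * m > self.valid_optim * metric
                for _, m in self.best_k_models
            )
            new_path = self._path(rank + 1, steps)
            self.best_k_models.insert(rank, [new_path, metric])
            print(f'step {steps}: save {new_path} (metric {metric})')
            # evict the checkpoint that fell out of the top k, if any
            if len(self.best_k_models) > self.k:
                evicted_path, _ = self.best_k_models.pop()
                print(f'  delete {evicted_path}')
            # every kept checkpoint behind the new one moves one rank down
            for ind in range(rank + 1, len(self.best_k_models)):
                old_path = self.best_k_models[ind][0]
                renamed = self._path(ind + 1, old_path.rsplit('.', 1)[1])
                print(f'  rename {old_path} -> {renamed}')
                self.best_k_models[ind][0] = renamed


    if __name__ == '__main__':
        tracker = TopKTracker('model', k=3, maximize=False)  # e.g. minimizing ppl
        for steps, ppl in [(100, 12.0), (200, 10.0), (300, 11.0), (400, 9.0), (500, 13.0)]:
            tracker.report(ppl, steps)
        print('kept:', tracker.best_k_models)

With these illustrative numbers the run ends holding model_1st.400,
model_2nd.200 and model_3rd.300, which is the insert/evict/rename behaviour
that patches 05 through 08 iterate toward.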