From 87f6b9c5302aa9f9e3df276d80fcb4be2365ea7c Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 05:24:38 -0700 Subject: [PATCH 01/16] feat: Add optimizer_of_name function for dynamic optimizer creation --- netam/common.py | 11 +++++++++++ netam/framework.py | 9 +++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/netam/common.py b/netam/common.py index deb2131d..188b09c5 100644 --- a/netam/common.py +++ b/netam/common.py @@ -142,6 +142,17 @@ def stack_heterogeneous(tensors, pad_value=0.0): return torch.stack(padded_tensors) +def optimizer_of_name(optimizer_name, model_parameters, **kwargs): + """ + Build a torch.optim optimizer from a string name and model parameters. + """ + try: + optimizer_class = getattr(optim, optimizer_name) + return optimizer_class(model_parameters, **kwargs) + except AttributeError: + raise ValueError(f"Optimizer '{optimizer_name}' is not recognized in torch.optim") + + def find_least_used_cuda_gpu(): """ Find the least used CUDA GPU on the system using nvidia-smi. diff --git a/netam/framework.py b/netam/framework.py index 4a329666..fc5a6542 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -19,6 +19,7 @@ generate_kmers, kmer_to_index_of, nt_mask_tensor_of, + optimizer_of_name, BASES, BASES_AND_N_TO_INDEX, BIG, @@ -371,6 +372,7 @@ def __init__( train_dataset, val_dataset, model, + optimizer_name="AdamW", batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, @@ -383,12 +385,13 @@ def __init__( """ self.train_dataset = train_dataset self.val_dataset = val_dataset - self.batch_size = batch_size if train_dataset is not None: self.writer = SummaryWriter(log_dir=f"./_logs/{name}") self.writer.add_text("model_name", model.__class__.__name__) self.writer.add_text("model_hyperparameters", str(model.hyperparameters)) self.model = model + self.optimizer_name = optimizer_name + self.batch_size = batch_size self.learning_rate = learning_rate self.min_learning_rate = min_learning_rate self.l2_regularization_coeff = l2_regularization_coeff @@ -417,7 +420,9 @@ def reset_optimization(self, learning_rate=None): """Reset the optimizer and scheduler.""" if learning_rate is None: learning_rate = self.learning_rate - self.optimizer = torch.optim.AdamW( + + self.optimizer = optimizer_of_name( + self.optimizer_name, self.model.parameters(), lr=learning_rate, weight_decay=self.l2_regularization_coeff, From e1821560ae3a9702321776e9ed3a1dfef8a4a1e1 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 09:53:17 -0700 Subject: [PATCH 02/16] save more hparams in yml --- netam/dnsm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/netam/dnsm.py b/netam/dnsm.py index 62d8db44..ad7a34a2 100644 --- a/netam/dnsm.py +++ b/netam/dnsm.py @@ -420,7 +420,11 @@ def to_crepe(self): training_hyperparameters = { key: self.__dict__[key] for key in [ + "optimizer_name", + "batch_size", "learning_rate", + "min_learning_rate", + "l2_regularization_coeff", ] } encoder = framework.PlaceholderEncoder() From 5a8bbe987addb59ee8ecc1535a2c8b07464a31bb Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 09:57:59 -0700 Subject: [PATCH 03/16] adding cli to netam for concat_csvs :joy: --- netam/cli.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 netam/cli.py diff --git a/netam/cli.py b/netam/cli.py new file mode 100644 index 00000000..c9ae493a --- /dev/null +++ b/netam/cli.py @@ -0,0 +1,35 @@ +import fire +import pandas as pd + + +def concatenate_csvs( + input_csvs_str: str, + output_csv: 
str, + is_tsv: bool = False, + record_path: bool = False, +): + """ + This function concatenates multiple CSV or TSV files into one CSV file. + + Args: + input_csvs: A string of paths to the input CSV or TSV files separated by commas. + output_csv: Path to the output CSV file. + is_tsv: A boolean flag that determines whether the input files are TSV. + record_path: A boolean flag that adds a column recording the path of the input_csv. + """ + input_csvs = input_csvs_str.split(",") + dfs = [] + + for csv in input_csvs: + df = pd.read_csv(csv, delimiter="\t" if is_tsv else ",") + if record_path: + df["input_file_path"] = csv + dfs.append(df) + + result_df = pd.concat(dfs, ignore_index=True) + + result_df.to_csv(output_csv, index=False) + + +def main(): + fire.Fire() From bae32b86ffb6744acc2fea81e41a847c0b6f3c22 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 12:21:57 -0700 Subject: [PATCH 04/16] if mistake --- netam/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netam/framework.py b/netam/framework.py index fc5a6542..b138df6e 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -323,7 +323,7 @@ def load_crepe(prefix, device=None): model.eval() crepe_instance = Crepe(encoder, model, config["training_hyperparameters"]) - if device: + if device is not None: crepe_instance.to(device) return crepe_instance From 8cd8d07375ddbf790b90d23185fe50147fe17075 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 12:29:23 -0700 Subject: [PATCH 05/16] crepe is on device --- netam/framework.py | 1 + 1 file changed, 1 insertion(+) diff --git a/netam/framework.py b/netam/framework.py index b138df6e..05474645 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -360,6 +360,7 @@ def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None): def add_shm_model_outputs_to_pcp_df(pcp_df, crepe_prefix, device=None): crepe = load_crepe(crepe_prefix, device=device) + print(f"crepe is on {crepe.device}") rates, csps = trimmed_shm_model_outputs_of_crepe(crepe, pcp_df["parent"]) pcp_df["rates"] = rates pcp_df["subs_probs"] = csps From d5783fd929e06ae43e2bea51af79257785de8ef7 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Tue, 11 Jun 2024 15:36:53 -0700 Subject: [PATCH 06/16] make format --- netam/common.py | 4 +++- netam/framework.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/netam/common.py b/netam/common.py index 188b09c5..e7dc95a6 100644 --- a/netam/common.py +++ b/netam/common.py @@ -150,7 +150,9 @@ def optimizer_of_name(optimizer_name, model_parameters, **kwargs): optimizer_class = getattr(optim, optimizer_name) return optimizer_class(model_parameters, **kwargs) except AttributeError: - raise ValueError(f"Optimizer '{optimizer_name}' is not recognized in torch.optim") + raise ValueError( + f"Optimizer '{optimizer_name}' is not recognized in torch.optim" + ) def find_least_used_cuda_gpu(): diff --git a/netam/framework.py b/netam/framework.py index 05474645..51b65778 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -360,7 +360,6 @@ def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None): def add_shm_model_outputs_to_pcp_df(pcp_df, crepe_prefix, device=None): crepe = load_crepe(crepe_prefix, device=device) - print(f"crepe is on {crepe.device}") rates, csps = trimmed_shm_model_outputs_of_crepe(crepe, pcp_df["parent"]) pcp_df["rates"] = rates pcp_df["subs_probs"] = csps @@ -421,7 +420,7 @@ def reset_optimization(self, learning_rate=None): """Reset the optimizer and 
scheduler.""" if learning_rate is None: learning_rate = self.learning_rate - + self.optimizer = optimizer_of_name( self.optimizer_name, self.model.parameters(), From 960359fa129223ccacc6b40feaf66c45a03aa88e Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Wed, 12 Jun 2024 04:30:44 -0700 Subject: [PATCH 07/16] Fix and returning to Adam as default --- netam/framework.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/netam/framework.py b/netam/framework.py index 51b65778..289decc7 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -372,7 +372,7 @@ def __init__( train_dataset, val_dataset, model, - optimizer_name="AdamW", + optimizer_name="Adam", batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, @@ -757,6 +757,7 @@ def __init__( train_dataset, val_dataset, model, + optimizer_name="Adam", batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, @@ -767,11 +768,12 @@ def __init__( train_dataset, val_dataset, model, - batch_size, - learning_rate, - min_learning_rate, - l2_regularization_coeff, - name, + optimizer_name=optimizer_name, + batch_size=batch_size, + learning_rate=learning_rate, + min_learning_rate=min_learning_rate, + l2_regularization_coeff=l2_regularization_coeff, + name=name, ) def loss_of_batch(self, batch): From 0543dbe8e2e14cfb8560f4f5b63c6be9d9ff9885 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Wed, 12 Jun 2024 11:54:51 -0700 Subject: [PATCH 08/16] feat: Update optimizer_of_name function to handle SGDMomentum optimizer --- netam/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/netam/common.py b/netam/common.py index e7dc95a6..5625cb98 100644 --- a/netam/common.py +++ b/netam/common.py @@ -145,7 +145,12 @@ def stack_heterogeneous(tensors, pad_value=0.0): def optimizer_of_name(optimizer_name, model_parameters, **kwargs): """ Build a torch.optim optimizer from a string name and model parameters. + + Use a SGD optimizer with momentum if the optimizer_name is "SGDMomentum". """ + if optimizer_name == "SGDMomentum": + optimizer_name = "SGD" + kwargs["momentum"] = 0.9 try: optimizer_class = getattr(optim, optimizer_name) return optimizer_class(model_parameters, **kwargs) From 4efc5c0f6576aae9386953a78c05e1015c3fab2d Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Wed, 12 Jun 2024 14:26:29 -0700 Subject: [PATCH 09/16] RMSprop by default! 
--- netam/framework.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/netam/framework.py b/netam/framework.py index 289decc7..4f1e94ca 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -372,7 +372,7 @@ def __init__( train_dataset, val_dataset, model, - optimizer_name="Adam", + optimizer_name="RMSprop", batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, @@ -757,7 +757,7 @@ def __init__( train_dataset, val_dataset, model, - optimizer_name="Adam", + optimizer_name="RMSprop", batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, From 2c11a3f1b49c31c0a2a43e79c9e5039e75d19344 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Thu, 13 Jun 2024 03:32:05 -0700 Subject: [PATCH 10/16] - --- netam/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netam/framework.py b/netam/framework.py index 4f1e94ca..64db9c97 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -725,7 +725,7 @@ def joint_train( optimize_branch_lengths() self.mark_branch_lengths_optimized(0) for cycle in range(cycle_count): - print(f"### Beginning cycle {cycle + 1}/{cycle_count}") + print(f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}") self.mark_branch_lengths_optimized(cycle + 1) current_lr = self.optimizer.param_groups[0]["lr"] # set new_lr to be the geometric mean of current_lr and the From edcfbeaf7f410e16e44ddef79b542e5e34915324 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Thu, 13 Jun 2024 04:44:05 -0700 Subject: [PATCH 11/16] make format --- netam/framework.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/netam/framework.py b/netam/framework.py index 64db9c97..e4ed50ac 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -725,7 +725,9 @@ def joint_train( optimize_branch_lengths() self.mark_branch_lengths_optimized(0) for cycle in range(cycle_count): - print(f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}") + print( + f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}" + ) self.mark_branch_lengths_optimized(cycle + 1) current_lr = self.optimizer.param_groups[0]["lr"] # set new_lr to be the geometric mean of current_lr and the From 46f8c6802679592419d05e15fcfeead2d25d53b0 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Thu, 13 Jun 2024 14:54:49 -0700 Subject: [PATCH 12/16] don't count hours --- netam/framework.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/netam/framework.py b/netam/framework.py index e4ed50ac..0fdc9f65 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -431,11 +431,11 @@ def reset_optimization(self, learning_rate=None): self.optimizer, mode="min", factor=0.5, patience=10 ) - def execution_hours(self): + def execution_time(self): """ - Return time in hours (rounded to 3 decimal places) since the Burrito was created. + Return time since the Burrito was created. 
""" - return round((time() - self.start_time) / 3600, 3) + return time() - self.start_time def multi_train(self, epochs, max_tries=3): """ @@ -456,7 +456,7 @@ def multi_train(self, epochs, max_tries=3): return train_history def write_loss(self, loss_name, loss, step): - self.writer.add_scalar(loss_name, loss, step, walltime=self.execution_hours()) + self.writer.add_scalar(loss_name, loss, step, walltime=self.execution_time()) def write_cuda_memory_info(self): megabyte_scaling_factor = 1 / 1024**2 @@ -695,7 +695,7 @@ def mark_branch_lengths_optimized(self, cycle): "branch length optimization", cycle, self.global_epoch, - walltime=self.execution_hours(), + walltime=self.execution_time(), ) def joint_train( @@ -725,9 +725,7 @@ def joint_train( optimize_branch_lengths() self.mark_branch_lengths_optimized(0) for cycle in range(cycle_count): - print( - f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}" - ) + print(f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}") self.mark_branch_lengths_optimized(cycle + 1) current_lr = self.optimizer.param_groups[0]["lr"] # set new_lr to be the geometric mean of current_lr and the @@ -967,10 +965,10 @@ def find_optimal_branch_lengths(self, dataset, **optimization_kwargs): def write_loss(self, loss_name, loss, step): rate_loss, csp_loss = loss.unbind() self.writer.add_scalar( - "Rate " + loss_name, rate_loss.item(), step, walltime=self.execution_hours() + "Rate " + loss_name, rate_loss.item(), step, walltime=self.execution_time() ) self.writer.add_scalar( - "CSP " + loss_name, csp_loss.item(), step, walltime=self.execution_hours() + "CSP " + loss_name, csp_loss.item(), step, walltime=self.execution_time() ) From 7e806b45a82a67800aec304d6dcdc4daa65c42d0 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Thu, 13 Jun 2024 14:55:03 -0700 Subject: [PATCH 13/16] make format --- netam/framework.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/netam/framework.py b/netam/framework.py index 0fdc9f65..a645466a 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -725,7 +725,9 @@ def joint_train( optimize_branch_lengths() self.mark_branch_lengths_optimized(0) for cycle in range(cycle_count): - print(f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}") + print( + f"### Beginning cycle {cycle + 1}/{cycle_count} using optimizer {self.optimizer_name}" + ) self.mark_branch_lengths_optimized(cycle + 1) current_lr = self.optimizer.param_groups[0]["lr"] # set new_lr to be the geometric mean of current_lr and the From ecb19fc638c7c606ce890f89ed2c7f475ccee8ca Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Thu, 13 Jun 2024 15:57:58 -0700 Subject: [PATCH 14/16] changing l2_regularization_coeff to weight_decay --- data/cnn_joi_sml-shmoof_small.yml | 2 +- netam/dnsm.py | 6 +++--- netam/framework.py | 12 ++++++------ netam/hyper_burrito.py | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/data/cnn_joi_sml-shmoof_small.yml b/data/cnn_joi_sml-shmoof_small.yml index db6ad39e..a455085d 100644 --- a/data/cnn_joi_sml-shmoof_small.yml +++ b/data/cnn_joi_sml-shmoof_small.yml @@ -11,6 +11,6 @@ model_hyperparameters: kmer_length: 3 serialization_version: 0 training_hyperparameters: - l2_regularization_coeff: 1.0e-06 + weight_decay: 1.0e-06 learning_rate: 0.1 min_learning_rate: 1.0e-06 diff --git a/netam/dnsm.py b/netam/dnsm.py index ad7a34a2..d07dcd3b 100644 --- a/netam/dnsm.py +++ b/netam/dnsm.py @@ -424,7 +424,7 @@ def to_crepe(self): 
"batch_size", "learning_rate", "min_learning_rate", - "l2_regularization_coeff", + "weight_decay", ] } encoder = framework.PlaceholderEncoder() @@ -448,7 +448,7 @@ def burrito_of_model( batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, - l2_regularization_coeff=1e-6, + weight_decay=1e-6, ): model.to(device) burrito = DNSMBurrito( @@ -458,6 +458,6 @@ def burrito_of_model( batch_size=batch_size, learning_rate=learning_rate, min_learning_rate=min_learning_rate, - l2_regularization_coeff=l2_regularization_coeff, + weight_decay=weight_decay, ) return burrito diff --git a/netam/framework.py b/netam/framework.py index a645466a..20a487a6 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -376,7 +376,7 @@ def __init__( batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, - l2_regularization_coeff=1e-6, + weight_decay=1e-6, name="", ): """ @@ -394,7 +394,7 @@ def __init__( self.batch_size = batch_size self.learning_rate = learning_rate self.min_learning_rate = min_learning_rate - self.l2_regularization_coeff = l2_regularization_coeff + self.weight_decay = weight_decay self.name = name self.reset_optimization() self.bce_loss = nn.BCELoss() @@ -425,7 +425,7 @@ def reset_optimization(self, learning_rate=None): self.optimizer_name, self.model.parameters(), lr=learning_rate, - weight_decay=self.l2_regularization_coeff, + weight_decay=self.weight_decay, ) self.scheduler = ReduceLROnPlateau( self.optimizer, mode="min", factor=0.5, patience=10 @@ -763,7 +763,7 @@ def __init__( batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, - l2_regularization_coeff=1e-6, + weight_decay=1e-6, name="", ): super().__init__( @@ -774,7 +774,7 @@ def __init__( batch_size=batch_size, learning_rate=learning_rate, min_learning_rate=min_learning_rate, - l2_regularization_coeff=l2_regularization_coeff, + weight_decay=weight_decay, name=name, ) @@ -829,7 +829,7 @@ def to_crepe(self): for key in [ "learning_rate", "min_learning_rate", - "l2_regularization_coeff", + "weight_decay", ] } encoder = KmerSequenceEncoder( diff --git a/netam/hyper_burrito.py b/netam/hyper_burrito.py index 25d9d957..e1dad26f 100644 --- a/netam/hyper_burrito.py +++ b/netam/hyper_burrito.py @@ -181,7 +181,7 @@ def burrito_of_model( batch_size=1024, learning_rate=0.1, min_learning_rate=1e-4, - l2_regularization_coeff=1e-6, + weight_decay=1e-6, ): burrito = SHMBurrito( self.train_dataset, @@ -190,6 +190,6 @@ def burrito_of_model( batch_size=batch_size, learning_rate=learning_rate, min_learning_rate=min_learning_rate, - l2_regularization_coeff=l2_regularization_coeff, + weight_decay=weight_decay, ) return burrito From 6dd1fcc22d424e04ce7e02ce025fa87040da4e50 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Fri, 14 Jun 2024 10:26:19 -0700 Subject: [PATCH 15/16] bl opt default --- netam/framework.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/netam/framework.py b/netam/framework.py index 20a487a6..30c021bf 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -699,7 +699,7 @@ def mark_branch_lengths_optimized(self, cycle): ) def joint_train( - self, epochs=100, cycle_count=2, training_method="full", out_prefix=None + self, epochs=100, cycle_count=2, training_method="full", out_prefix=None, optimize_bl_first_cycle=True ): """ Do joint optimization of model and branch lengths. @@ -708,6 +708,10 @@ def joint_train( If training_method is "yun", then we use Yun's approximation to the branch lengths. If training_method is "fixed", then we fix the branch lengths and only optimize the model. 
+ We give an option to optimize the branch lengths in the first cycle (by + default we do). But, this can be useful to turn off e.g. if we've loaded + in some preoptimized branch lengths. + We reset the optimization after each cycle, and we use a learning rate schedule that uses a weighted geometric mean of the current learning rate and the initial learning rate that progressively moves towards @@ -722,7 +726,8 @@ def joint_train( else: raise ValueError(f"Unknown training method {training_method}") loss_history_l = [] - optimize_branch_lengths() + if optimize_bl_first_cycle: + optimize_branch_lengths() self.mark_branch_lengths_optimized(0) for cycle in range(cycle_count): print( From 98a775924c5278a0377a368dc8d6f51aed017166 Mon Sep 17 00:00:00 2001 From: Erick Matsen Date: Fri, 14 Jun 2024 11:05:47 -0700 Subject: [PATCH 16/16] make format --- netam/framework.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/netam/framework.py b/netam/framework.py index 30c021bf..42107d69 100644 --- a/netam/framework.py +++ b/netam/framework.py @@ -699,7 +699,12 @@ def mark_branch_lengths_optimized(self, cycle): ) def joint_train( - self, epochs=100, cycle_count=2, training_method="full", out_prefix=None, optimize_bl_first_cycle=True + self, + epochs=100, + cycle_count=2, + training_method="full", + out_prefix=None, + optimize_bl_first_cycle=True, ): """ Do joint optimization of model and branch lengths.
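
Notes (not part of any patch in this series): the sketches below are illustrative only and assume a tree with all sixteen patches applied. Any file name, variable name, or hyperparameter value that does not appear in the diffs above is made up for the example.

A minimal sketch of how optimizer_of_name from netam/common.py (patches 01 and 08) is expected to behave; the nn.Linear toy model is a hypothetical stand-in for a real netam model:

    import torch.nn as nn
    from netam.common import optimizer_of_name

    model = nn.Linear(4, 1)  # hypothetical stand-in for a netam model

    # Any torch.optim class can be requested by its class name.
    opt = optimizer_of_name("RMSprop", model.parameters(), lr=0.1, weight_decay=1e-6)

    # "SGDMomentum" is the one special case: it is rewritten to torch.optim.SGD
    # with momentum=0.9 (patch 08).
    opt = optimizer_of_name("SGDMomentum", model.parameters(), lr=0.1)

    # Unrecognized names surface as ValueError rather than AttributeError.
    try:
        optimizer_of_name("NotAnOptimizer", model.parameters(), lr=0.1)
    except ValueError as err:
        print(err)

A hedged usage sketch for the concatenate_csvs helper added in patch 03; the input and output paths here are hypothetical:

    from netam.cli import concatenate_csvs

    concatenate_csvs(
        "runs/a.tsv,runs/b.tsv",  # comma-separated list of input paths
        "combined.csv",
        is_tsv=True,              # inputs are tab-separated
        record_path=True,         # adds an input_file_path column to each row
    )

Finally, a sketch of the optimize_bl_first_cycle switch added to joint_train in patch 15; "burrito" stands for any Burrito subclass instance and the keyword values are illustrative:

    burrito.joint_train(
        epochs=100,
        cycle_count=2,
        training_method="yun",
        optimize_bl_first_cycle=False,  # e.g. preoptimized branch lengths were already loaded
    )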