Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix FairseqHydraPretrainJob for better start_checkpoint behavior #554

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions fairseq/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ def __init__(
self.package_name = package_name
self.check_consistency()

@property
def data(self):
    """
    Expose the configuration dictionary held by this FairseqHydraConfig.

    The stored ``config_dict`` object itself is returned (no copy is made).
    """
    underlying = self.config_dict
    return underlying

def write(self, path: str):
config_dict = self.config_dict.copy()
config_dict = util.update_nested_dict(config_dict, self.post_config_dict)
Expand Down Expand Up @@ -170,6 +177,9 @@ def __init__(
kwargs = locals()
del kwargs["self"]

# save start checkpoint (removed from the config here); it is later linked as checkpoint_last.pt
self.start_checkpoint = fairseq_hydra_config.data.get("checkpoint", {}).pop("restore_file")

self.command_line_args = command_line_args or []
stored_epochs = list(range(save_interval, max_epoch, save_interval)) + [max_epoch]

Expand Down Expand Up @@ -231,9 +241,28 @@ def create_fairseq_hydra_config(cls, fairseq_hydra_config, max_epoch, max_update
}
res.update(FairseqHydraConfig(config_dict, post_config_dict))
return res

def _fairseq_prepare_checkpoint(self, start_checkpoint):
    """
    Link the start checkpoint into the output checkpoint directory.

    If ``checkpoint_last.pt`` does not exist yet in ``self.out_checkpoint_dir``, two symlinks
    pointing to ``start_checkpoint`` are created there: ``checkpoint_last.pt`` and a link that
    carries the file name of the start checkpoint. If ``checkpoint_last.pt`` already exists,
    nothing is changed.

    :param start_checkpoint: path of the checkpoint to start from, or None to do nothing
    :raises FileNotFoundError: if ``start_checkpoint`` is given but does not exist
    """
    if start_checkpoint is None:
        print("No start checkpoint provided")
        return
    if not os.path.exists(start_checkpoint):
        raise FileNotFoundError(f"Start checkpoint {start_checkpoint} does not exist")

    checkpoint_dir = self.out_checkpoint_dir.get_path()
    last_link = os.path.join(checkpoint_dir, "checkpoint_last.pt")
    if os.path.exists(last_link):
        # an existing checkpoint_last.pt is left untouched
        return

    print(f"Linking {start_checkpoint} to {checkpoint_dir}")
    os.symlink(start_checkpoint, last_link)

    # The second link keeps the original file name available in the output directory.
    # os.symlink raises FileExistsError for an existing destination, so skip the link when
    # that name is already taken (e.g. the start checkpoint is itself named
    # checkpoint_last.pt and was just linked above).
    named_link = os.path.join(checkpoint_dir, os.path.basename(start_checkpoint))
    if not os.path.lexists(named_link):
        os.symlink(start_checkpoint, named_link)
michelwi marked this conversation as resolved.
Show resolved Hide resolved

def create_files(self):
    """
    Prepare the files for the job, in this order: write the fairseq hydra config,
    handle the start checkpoint, and create the ``fairseq.sh`` executable.
    """
    # write the hydra config to its output path
    yaml_path = self.out_fairseq_hydra_yaml.get_path()
    self.fairseq_hydra_config.write(yaml_path)

    # hand the stored start checkpoint to the checkpoint preparation step
    self._fairseq_prepare_checkpoint(self.start_checkpoint)

    # create the executable from the run command
    run_cmd = self._get_run_cmd()
    util.create_executable("fairseq.sh", run_cmd)

def run(self):
Expand Down
Loading