Skip to content

Commit

Permalink
Update file exists checkpointing error messages to be more helpful (#…
Browse files (browse the repository at this point in the history)
  • Loading branch information
irenedea authored Nov 16, 2023
1 parent bfbb89a commit 15d4c34
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 5 deletions.
7 changes: 6 additions & 1 deletion composer/callbacks/checkpoint_saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,12 @@ def _save_checkpoint(self, state: State, logger: Logger):
).lstrip('/')

log.debug(f'Uploading checkpoint to {remote_file_name}')
logger.upload_file(remote_file_name=remote_file_name, file_path=saved_path, overwrite=self.overwrite)
try:
logger.upload_file(remote_file_name=remote_file_name, file_path=saved_path, overwrite=self.overwrite)
except FileExistsError as e:
raise FileExistsError(
f'Uploading checkpoint failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite checkpoints with Trainer, set save_overwrite to True.'
)

# symlinks stay the same with sharded checkpointing
if self.latest_remote_file_name is not None:
Expand Down
4 changes: 2 additions & 2 deletions composer/loggers/remote_uploader_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ def _check_workers(self):
# Periodically check to see if any of the upload workers crashed
# They would crash if:
# a) A file could not be uploaded, and the retry counter failed, or
# b) allow_overwrite=False, but the file already exists,
# b) overwrite=False, but the file already exists,
if not self._all_workers_alive:
if not self._exception_queue.empty():
exception = self._exception_queue.get_nowait()
Expand Down Expand Up @@ -615,7 +615,7 @@ def upload_file():
pass
else:
# Exceptions will be detected on the next batch_end or epoch_end event
e = FileExistsError(f'Object {uri} already exists, but allow_overwrite was set to False.')
e = FileExistsError(f'Object {uri} already exists, but overwrite was set to False.')
exception_queue.put_nowait(e)
raise e
log.info('Uploading file %s to %s', file_path_to_upload, uri)
Expand Down
3 changes: 1 addition & 2 deletions tests/loggers/test_remote_uploader_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ def object_store_test_helper(
# the fatal exception that the worker throws.
with pytest.raises(
FileExistsError,
match=
f'Object local://{remote_file_name} already exists, but allow_overwrite was set to False.'):
match=f'Object local://{remote_file_name} already exists, but overwrite was set to False.'):
remote_uploader_downloader.run_event(event_to_test, dummy_state, logger)

else:
Expand Down

0 comments on commit 15d4c34

Please sign in to comment.