[tune/train] Restore Tuner and results properly from moved storage path #40647
Changes from 8 commits
New `set_storage` method on the trial class, which rewrites the paths of tracked checkpoints when the storage location changes:

```diff
@@ -699,6 +699,36 @@ def set_experiment_tag(self, experiment_tag):
         self.experiment_tag = experiment_tag
         self.invalidate_json_state()
 
+    def set_storage(self, new_storage: StorageContext):
+        """Updates the storage context of the trial.
+
+        If the `storage_path` or `experiment_dir_name` has changed, then this setter
+        also updates the paths of all checkpoints tracked by the checkpoint manager.
+        This enables restoration from a checkpoint if the user moves the directory.
+        """
+        original_storage = self.storage
+
+        checkpoint_manager = self.run_metadata.checkpoint_manager
+
+        for checkpoint_result in checkpoint_manager.best_checkpoint_results:
+            checkpoint_result.checkpoint = Checkpoint(
+                path=checkpoint_result.checkpoint.path.replace(
+                    original_storage.trial_fs_path, new_storage.trial_fs_path, 1
+                ),
+                filesystem=new_storage.storage_filesystem,
+            )
+        latest_checkpoint_result = checkpoint_manager.latest_checkpoint_result
+        if latest_checkpoint_result:
+            latest_checkpoint_result.checkpoint = Checkpoint(
+                path=latest_checkpoint_result.checkpoint.path.replace(
+                    original_storage.trial_fs_path, new_storage.trial_fs_path, 1
+                ),
+                filesystem=new_storage.storage_filesystem,
+            )
+
+        self.storage = new_storage
+        self.invalidate_json_state()
+
     @property
     def num_failures(self):
         return self.run_metadata.num_failures
```

Review thread on the `filesystem=new_storage.storage_filesystem` lines:

- Updating the storage filesystem means that we require resuming from an experiment directory that contains everything: it's not possible to start training on S3, download everything except for the checkpoints to local, then restore from local.
- I think this makes sense... let's document this somewhere?
- Makes sense. Given that checkpoints are stored in the trial directory (within the experiment directory), users will generally download everything together?
- I think so. However, I think the reason people do this is that loading results from the cloud is not so easy.
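The mechanic behind `set_storage` is a single prefix swap on each checkpoint path. Below is a minimal standalone sketch of that rewrite; all paths are made up for illustration and none of them come from the PR:

```python
# Illustration only, not Ray code: the same `str.replace(old, new, 1)` call
# that `set_storage` applies to every tracked checkpoint path.
old_trial_fs_path = "bucket/ray_results/exp_dir/trial_0"
new_trial_fs_path = "bucket/new_ray_results/new_exp_dir/trial_0"

checkpoint_path = "bucket/ray_results/exp_dir/trial_0/checkpoint_000002"

# The count argument of 1 swaps only the first (leading) occurrence, so a
# path that happens to repeat the old prefix deeper down is left intact there.
rewritten = checkpoint_path.replace(old_trial_fs_path, new_trial_fs_path, 1)
assert rewritten == "bucket/new_ray_results/new_exp_dir/trial_0/checkpoint_000002"
```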
Removal of the previously skipped `ResultGrid` test, now covered by the tuner restore test below:

```diff
@@ -209,11 +209,6 @@ def train_fn(config):
     assert isinstance(result_grid.errors[0], RuntimeError)
 
 
-def test_result_grid_moved_experiment_path(ray_start_2_cpus, tmpdir):
-    # TODO(justinvyu): [handle_moved_storage_path]
-    pytest.skip("Not implemented yet.")
-
-
 if __name__ == "__main__":
     import sys
```

Review comment on lines -212 to -214: this test got merged with the tuner restore test.
Updates to the tuner restore tests. First, import `ExperimentAnalysis`:

```diff
@@ -25,6 +25,7 @@
     _upload_to_fs_path,
 )
 from ray.tune import Callback, Trainable
+from ray.tune.analysis import ExperimentAnalysis
 from ray.tune.execution.experiment_state import _find_newest_experiment_checkpoint
 from ray.tune.experiment import Trial
 from ray.tune.result_grid import ResultGrid
```
Un-skip `test_tuner_restore_from_moved_experiment_path`:

```diff
@@ -743,8 +744,6 @@ def create_trainable_with_params():
     assert not results.errors
 
 
-# TODO(justinvyu): [handle_moved_storage_path]
-@pytest.mark.skip("Restoring from a moved storage path is not supported yet.")
 @pytest.mark.parametrize("use_tune_run", [True, False])
 def test_tuner_restore_from_moved_experiment_path(
     ray_start_2_cpus, tmp_path, use_tune_run
```
Rename the `local_dir` variables to match the `storage_path` terminology:

```diff
@@ -755,10 +754,10 @@ def test_tuner_restore_from_moved_experiment_path(
     fail_marker = tmp_path / "fail_marker"
     fail_marker.write_text("", encoding="utf-8")
 
-    old_local_dir = tmp_path / "ray_results"
+    old_storage_path = tmp_path / "ray_results"
     old_exp_name = "exp_dir"
 
-    new_local_dir = tmp_path / "new_ray_results"
+    new_storage_path = tmp_path / "new_ray_results"
     new_exp_name = "new_exp_dir"
 
     # Initial training run (that errors out in the middle)
```
Move the experiment before reading any results, then verify that the results and checkpoint paths resolve at the new location:

```diff
@@ -770,25 +769,35 @@ def test_tuner_restore_from_moved_experiment_path(
         ),
         run_config=RunConfig(
             name=old_exp_name,
-            storage_path=str(old_local_dir),
+            storage_path=str(old_storage_path),
             checkpoint_config=CheckpointConfig(num_to_keep=num_to_keep),
         ),
         param_space={
             "failing_hanging": (fail_marker, None),
         },
     )
-    results = tuner.fit()
-    assert len(results.errors) == 1
+    tuner.fit()
+
+    # Move experiment from `tmp_path/ray_results/exp_dir`
+    # to `tmp_path/new_ray_results/new_exp_dir`, changing both `storage_path` and
+    # the experiment `name`
+    shutil.move(str(old_storage_path), str(new_storage_path))
+    os.rename(
+        str(new_storage_path / old_exp_name), str(new_storage_path / new_exp_name)
+    )
+
+    # Check that the results can be read from the new location.
+    restore_path = str(new_storage_path / new_exp_name)
+    results = ResultGrid(ExperimentAnalysis(restore_path))
+
+    # TODO(justinvyu): [populate_exception] for storage_path != None
+    # assert len(results.errors) == 1
     training_iteration = results[0].metrics["training_iteration"]
     assert (
         training_iteration == 1
     ), f"Should only have 1 train.report before erroring, got {training_iteration}"
 
-    # Move experiment from `tmp_path/ray_results/exp_dir`
-    # to `tmp_path/moved_ray_results/new_exp_dir`, changing both `local_dir` and
-    # the experiment `name`
-    shutil.move(str(old_local_dir), str(new_local_dir))
-    os.rename(str(new_local_dir / old_exp_name), str(new_local_dir / new_exp_name))
+    assert results[0].checkpoint.path.endswith("checkpoint_000000")
+    assert "new_exp_dir" in results[0].checkpoint.path
 
     del tuner
     # Remove fail_marker so that the restored Tuner doesn't error again
```

Review comment on lines +794 to +796 (the commented-out error assertion): This doesn't work right now because it's looking for the error locally at …
Restore with the renamed variables (`restore_path` is now defined earlier in the test):

```diff
@@ -799,18 +808,18 @@ def test_tuner_restore_from_moved_experiment_path(
         analysis = tune.run(
             _train_fn_sometimes_failing,
             name=new_exp_name,
-            storage_path=str(new_local_dir),
+            storage_path=str(new_storage_path),
             resume="AUTO+ERRORED",
         )
         results = ResultGrid(analysis)
     else:
-        restore_path = str(new_local_dir / new_exp_name)
         tuner = Tuner.restore(
             restore_path, trainable=_train_fn_sometimes_failing, resume_errored=True
         )
         results = tuner.fit()
 
     assert len(results.errors) == 0
 
+    # Check that we restored iter=1, then made 2 calls to train.report -> iter=3
     training_iteration = results[0].metrics["training_iteration"]
     assert training_iteration == 3, training_iteration
```
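For context on the iter=1 to iter=3 arithmetic: `_train_fn_sometimes_failing` is a helper defined elsewhere in Ray's test utilities, so its exact body is not shown in this diff. The sketch below is a hypothetical stand-in showing the fail-then-resume pattern the assertions depend on; the checkpoint file layout and config keys are invented for illustration:

```python
# Hypothetical sketch, not the real `_train_fn_sometimes_failing` helper.
import os
import tempfile

from ray import train


def train_fn_sometimes_failing(config):
    # Resume from the iteration stored in the checkpoint, if one exists.
    start = 1
    checkpoint = train.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            with open(os.path.join(ckpt_dir, "iter.txt")) as f:
                start = int(f.read()) + 1

    for it in range(start, config.get("num_epochs", 3) + 1):
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, "iter.txt"), "w") as f:
                f.write(str(it))
            # Each report bumps `training_iteration` by one.
            train.report(
                {"it": it}, checkpoint=train.Checkpoint.from_directory(tmpdir)
            )
        # First run: the fail marker exists, so we crash right after iter 1.
        fail_marker = config.get("fail_marker")
        if fail_marker and os.path.exists(fail_marker):
            raise RuntimeError("Failing on purpose!")
```

First run: one report at iter 1, then a crash while the fail marker exists, so `training_iteration == 1`. After the marker is removed and the run is restored, it resumes from the iter-1 checkpoint and reports iters 2 and 3, giving `training_iteration == 3`.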
Finally, check the kept checkpoints via `Result.path` (replacing the removed `log_dir`), assert that the old directory is gone, and remove the skipped cloud-URI test stub at this location:

```diff
@@ -819,21 +828,12 @@ def test_tuner_restore_from_moved_experiment_path(
     assert results[0].checkpoint
     assert len(results[0].best_checkpoints) == num_to_keep
     checkpoint_dirs = [
-        path
-        for path in os.listdir(results[0].log_dir)
-        if path.startswith("checkpoint_")
+        path for path in os.listdir(results[0].path) if path.startswith("checkpoint_")
     ]
     assert sorted(checkpoint_dirs) == ["checkpoint_000001", "checkpoint_000002"]
 
     # Make sure that we did not create a logdir in the old location
-    assert not old_local_dir.exists()
-
-
-# TODO(justinvyu): [handle_moved_storage_path]
-@pytest.mark.skip("Restoring from a moved storage path is not supported yet.")
-def test_tuner_restore_from_moved_cloud_uri(ray_start_2_cpus, tmp_path):
-    """Test that restoring an experiment that was moved to a new remote URI
-    resumes and continues saving new results at that URI."""
+    assert not old_storage_path.exists()
 
 
 def test_custom_searcher_and_scheduler_restore(ray_start_2_cpus, tmpdir):
```
Review thread on the overall change:

- What would happen if an experiment directory was originally on S3 and then moved to disk? Will we automatically detect this and build a new filesystem object?
- `Tuner.restore("local_path")` would pass in a local storage path, which would be resolved into a local filesystem. I should add a test for this.
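To make the reply concrete: Ray's storage layer sits on top of pyarrow filesystems, and pyarrow can infer the filesystem type from the path itself. The helper below is invented for illustration (it is not the `StorageContext` API), but the `pyarrow.fs.FileSystem.from_uri` behavior it demonstrates is the standard mechanism:

```python
# Illustration of scheme-based filesystem resolution with pyarrow.
# `resolve_storage` is a made-up name, not part of Ray's API.
import pyarrow.fs


def resolve_storage(storage_path: str):
    # from_uri infers the filesystem from the scheme: an absolute local path
    # (or file:// URI) yields LocalFileSystem, while "s3://bucket/..." would
    # yield S3FileSystem (credentials/region permitting).
    fs, fs_path = pyarrow.fs.FileSystem.from_uri(storage_path)
    return fs, fs_path


fs, path = resolve_storage("/tmp/new_ray_results/new_exp_dir")
print(type(fs).__name__)  # LocalFileSystem
```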