From 083fa31449c3456e889269e44913578acfced67a Mon Sep 17 00:00:00 2001
From: sedrick-keh-tri <133716510+sedrick-keh-tri@users.noreply.github.com>
Date: Fri, 26 Apr 2024 08:33:17 -0700
Subject: [PATCH] sagemaker use reserved instances (#258)

* sagemaker use reserved instances

* lint
---
 sagemaker_train/launch_sagemaker_train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sagemaker_train/launch_sagemaker_train.py b/sagemaker_train/launch_sagemaker_train.py
index f2a180c2..58994328 100644
--- a/sagemaker_train/launch_sagemaker_train.py
+++ b/sagemaker_train/launch_sagemaker_train.py
@@ -189,6 +189,7 @@ def get_job_name(base):
         max_wait=5 * 24 * 60 * 60 if args.spot_instance else None,
         input_mode="FastFile",
         # environment={"TORCH_DISTRIBUTED_DEBUG": "DETAIL", "TORCH_CPP_LOG_LEVEL": "INFO"},
+        environment={"SM_USE_RESERVED_CAPACITY": "1"},
         keep_alive_period_in_seconds=30 * 60 if not args.spot_instance else None,  # 30 minutes
     )
 