You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Training image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py3
Instance type: ml.p3.2xlarge
checkpoint_local_path = "/state"
==================
Traceback (most recent call last):
File "main.py", line 543, in <module>
main()
File "main.py", line 181, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/opt/ml/code/main.py", line 349, in main_worker
train(train_loader, model, criterion, optimizer, epoch, args)
File "/opt/ml/code/main.py", line 390, in train
for i, (images, target) in enumerate(train_loader):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 989, in _next_data
return self._process_data(data)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1014, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
data = fetcher.fetch(index)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 139, in __getitem__
sample = self.transform(sample)
File "/opt/conda/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
img = t(img)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 724, in _call_impl
result = hook(self, input)
File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/hook.py", line 133, in forward_pre_hook
self._increment_step()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 517, in _increment_step
self._write_state()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 529, in _write_state
if self.state_store.is_checkpoint_updated():
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in is_checkpoint_updated
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in <listcomp>
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/genericpath.py", line 55, in getmtime
return os.stat(filename).st_mtime
FileNotFoundError: [Errno 2] No such file or directory: '/state/metadata.json.sagemaker-uploaded'
The text was updated successfully, but these errors were encountered:
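It seems that files get deleted between the time the list of checkpoint files is created:
checkpoint_files = self._get_checkpoint_files_in_dir(self._checkpoint_dir) - https://github.com/awslabs/sagemaker-debugger/blob/master/smdebug/core/state_store.py#L92
and the time their modification timestamps are read:
timestamps = [os.path.getmtime(file) for file in checkpoint_files] - https://github.com/awslabs/sagemaker-debugger/blob/master/smdebug/core/state_store.py#L99
In other words, this looks like a race condition: a file that was listed (here '/state/metadata.json.sagemaker-uploaded') no longer exists by the time os.path.getmtime is called on it, which raises FileNotFoundError.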
==================
Training image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py3
Instance type: ml.p3.2xlarge
checkpoint_local_path = "/state"
==================
Traceback (most recent call last):
File "main.py", line 543, in <module>
main()
File "main.py", line 181, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/opt/ml/code/main.py", line 349, in main_worker
train(train_loader, model, criterion, optimizer, epoch, args)
File "/opt/ml/code/main.py", line 390, in train
for i, (images, target) in enumerate(train_loader):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 989, in _next_data
return self._process_data(data)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1014, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
data = fetcher.fetch(index)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 139, in __getitem__
sample = self.transform(sample)
File "/opt/conda/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
img = t(img)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 724, in _call_impl
result = hook(self, input)
File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/hook.py", line 133, in forward_pre_hook
self._increment_step()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 517, in _increment_step
self._write_state()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 529, in _write_state
if self.state_store.is_checkpoint_updated():
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in is_checkpoint_updated
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in <listcomp>
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/genericpath.py", line 55, in getmtime
return os.stat(filename).st_mtime
FileNotFoundError: [Errno 2] No such file or directory: '/state/metadata.json.sagemaker-uploaded'
The text was updated successfully, but these errors were encountered: