Skip to content

Commit

Permalink
only produce tensorboard logs on rank 0 by default
Browse files Browse the repository at this point in the history
ghstack-source-id: 4255cc792b9a221bc5a012e91db92533dcfa2243
Pull Request resolved: #339
  • Loading branch information
tianyu-l committed May 29, 2024
1 parent 1ceaa4e commit 6a8455e
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
6 changes: 6 additions & 0 deletions torchtitan/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ def __init__(self):
             default="tb",
             help="Folder to dump TensorBoard states",
         )
+        self.parser.add_argument(
+            "--metrics.rank_0_only",
+            default=True,
+            action="store_true",
+            help="Whether to save TensorBoard metrics only for rank 0 or for all ranks",
+        )

         # model configs
         self.parser.add_argument(
Expand Down
15 changes: 10 additions & 5 deletions torchtitan/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,21 @@ def close(self):

 def build_metric_logger(config: JobConfig, tag: Optional[str] = None):
     dump_dir = config.job.dump_folder
-    save_tb_folder = config.metrics.save_tb_folder
-    # since we don't have run id yet, use current minute as identifier
+    tb_config = config.metrics
+    save_tb_folder = tb_config.save_tb_folder
+    # since we don't have run id, use current minute as the identifier
     datetime_str = datetime.now().strftime("%Y%m%d-%H%M")
     log_dir = os.path.join(dump_dir, save_tb_folder, datetime_str)

-    enable_tb = config.metrics.enable_tensorboard
+    enable_tb = tb_config.enable_tensorboard
     if enable_tb:
         logger.info(
             f"Metrics logging active. Tensorboard logs will be saved at {log_dir}"
         )
+        if tb_config.rank_0_only:
+            enable_tb = torch.distributed.get_rank() == 0
+        else:
+            rank_str = f"rank_{torch.distributed.get_rank()}"
+            log_dir = os.path.join(log_dir, rank_str)

-    rank_str = f"rank_{torch.distributed.get_rank()}"
-    return MetricLogger(os.path.join(log_dir, rank_str), tag, enable_tb)
+    return MetricLogger(log_dir, tag, enable_tb)

0 comments on commit 6a8455e

Please sign in to comment.