diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py
index d1d263283a3..bf56a1c86f5 100644
--- a/ludwig/trainers/trainer.py
+++ b/ludwig/trainers/trainer.py
@@ -402,57 +402,91 @@ def write_step_summary(cls, train_summary_writer, combined_loss, all_losses, ste
         # Log CUDA memory stats.
         if torch.cuda.is_available():
-            memory_stats = torch.cuda.memory_stats()
-            # Allocated bytes.
-            train_summary_writer.add_scalar(
-                "cuda/allocated_bytes.all.current", memory_stats["allocated_bytes.all.current"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/allocated_bytes.all.peak", memory_stats["allocated_bytes.all.peak"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/allocated_bytes.all.allocated", memory_stats["allocated_bytes.all.allocated"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/allocated_bytes.all.freed", memory_stats["allocated_bytes.all.freed"], global_step=step
-            )
+            for i in range(torch.cuda.device_count()):
+                device = torch.device(f"cuda:{i}")
+                memory_stats = torch.cuda.memory_stats(device=device)
+                gb_memory_stats = {k: v / (1000**3) for k, v in memory_stats.items()}
+                # Allocated bytes.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/allocated_gb.all.current",
+                    gb_memory_stats["allocated_bytes.all.current"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/allocated_gb.all.peak",
+                    gb_memory_stats["allocated_bytes.all.peak"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/allocated_gb.all.allocated",
+                    gb_memory_stats["allocated_bytes.all.allocated"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/allocated_gb.all.freed",
+                    gb_memory_stats["allocated_bytes.all.freed"],
+                    global_step=step,
+                )
 
-            # Reserved bytes.
-            train_summary_writer.add_scalar(
-                "cuda/reserved_bytes.all.current", memory_stats["reserved_bytes.all.current"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/reserved_bytes.all.peak", memory_stats["reserved_bytes.all.peak"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/reserved_bytes.all.allocated", memory_stats["reserved_bytes.all.allocated"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/reserved_bytes.all.freed", memory_stats["reserved_bytes.all.freed"], global_step=step
-            )
+                # Reserved bytes.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/reserved_gb.all.current",
+                    gb_memory_stats["reserved_bytes.all.current"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/reserved_gb.all.peak", gb_memory_stats["reserved_bytes.all.peak"], global_step=step
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/reserved_gb.all.allocated",
+                    gb_memory_stats["reserved_bytes.all.allocated"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/reserved_gb.all.freed",
+                    gb_memory_stats["reserved_bytes.all.freed"],
+                    global_step=step,
+                )
 
-            # Active bytes.
-            train_summary_writer.add_scalar(
-                "cuda/active_bytes.all.current", memory_stats["active_bytes.all.current"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/active_bytes.all.peak", memory_stats["active_bytes.all.peak"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/active_bytes.all.allocated", memory_stats["active_bytes.all.allocated"], global_step=step
-            )
-            train_summary_writer.add_scalar(
-                "cuda/active_bytes.all.freed", memory_stats["active_bytes.all.freed"], global_step=step
-            )
+                # Active bytes.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/active_gb.all.current",
+                    gb_memory_stats["active_bytes.all.current"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/active_gb.all.peak", gb_memory_stats["active_bytes.all.peak"], global_step=step
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/active_gb.all.allocated",
+                    gb_memory_stats["active_bytes.all.allocated"],
+                    global_step=step,
+                )
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/active_gb.all.freed", gb_memory_stats["active_bytes.all.freed"], global_step=step
+                )
 
-            # Global free memory.
-            train_summary_writer.add_scalar("cuda/global_free_memory", torch.cuda.mem_get_info()[0], global_step=step)
+                # Global free memory.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/global_free_memory_gb",
+                    torch.cuda.mem_get_info(device=device)[0] / (1000**3),
+                    global_step=step,
+                )
 
-            # Total memory occupied.
-            train_summary_writer.add_scalar(
-                "cuda/total_memory_occupied", torch.cuda.mem_get_info()[1], global_step=step
-            )
+                # Total memory occupied.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/total_memory_occupied_gb",
+                    torch.cuda.mem_get_info(device=device)[1] / (1000**3),
+                    global_step=step,
+                )
+                # Total memory used.
+                train_summary_writer.add_scalar(
+                    f"cuda/device{i}/total_memory_used_gb",
+                    (torch.cuda.mem_get_info(device=device)[1] - torch.cuda.mem_get_info(device=device)[0])
+                    / (1000**3),
+                    global_step=step,
+                )
 
         train_summary_writer.flush()
 
     def is_cpu_training(self):
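Review note: the added loop repeats the same four-counter add_scalar pattern for each of the three stat families. A minimal data-driven sketch of equivalent logging, not part of the patch; the helper name log_cuda_memory_stats is hypothetical, while the tags and the 1000**3 GB convention mirror the diff above:

import torch
from torch.utils.tensorboard import SummaryWriter

GB = 1000**3  # the patch divides by 1000**3 (decimal GB), not 1024**3


def log_cuda_memory_stats(train_summary_writer: SummaryWriter, step: int) -> None:
    """Log per-device CUDA memory stats with the same tag scheme as the patch."""
    for i in range(torch.cuda.device_count()):
        device = torch.device(f"cuda:{i}")
        stats = torch.cuda.memory_stats(device=device)
        # allocated_bytes/reserved_bytes/active_bytes each expose
        # ".all.{current,peak,allocated,freed}" counters.
        for family in ("allocated", "reserved", "active"):
            for counter in ("current", "peak", "allocated", "freed"):
                train_summary_writer.add_scalar(
                    f"cuda/device{i}/{family}_gb.all.{counter}",
                    stats[f"{family}_bytes.all.{counter}"] / GB,
                    global_step=step,
                )
        # mem_get_info returns (free_bytes, total_bytes) for the device.
        free, total = torch.cuda.mem_get_info(device=device)
        train_summary_writer.add_scalar(f"cuda/device{i}/global_free_memory_gb", free / GB, global_step=step)
        train_summary_writer.add_scalar(f"cuda/device{i}/total_memory_occupied_gb", total / GB, global_step=step)
        train_summary_writer.add_scalar(f"cuda/device{i}/total_memory_used_gb", (total - free) / GB, global_step=step)

Unpacking mem_get_info once per device also avoids the repeated calls the patch makes when computing the used-memory scalar.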