From 6d74d21d71ecc7c125598a93f600b95aa35e2811 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 11 Oct 2023 13:25:38 -0400 Subject: [PATCH] Log additional per-GPU information in model metadata files and GPU utilization on tensorboard. (#3712) --- ludwig/api.py | 36 ++++++++++++++++++++++++++---------- ludwig/trainers/trainer.py | 8 ++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/ludwig/api.py b/ludwig/api.py index 0afea35b1e0..c85fc67c3c5 100644 --- a/ludwig/api.py +++ b/ludwig/api.py @@ -2141,6 +2141,31 @@ def kfold_cross_validate( return kfold_cv_stats, kfold_split_indices +def _get_compute_description(backend) -> Dict: + """Returns the compute description for the backend.""" + compute_description = {"num_nodes": backend.num_nodes} + + if torch.cuda.is_available(): + # Assumption: All nodes are of the same instance type. + # TODO: fix for Ray where workers may be of different skus + compute_description.update( + { + "gpus_per_node": torch.cuda.device_count(), + "arch_list": torch.cuda.get_arch_list(), + "gencode_flags": torch.cuda.get_gencode_flags(), + "devices": {}, + } + ) + for i in range(torch.cuda.device_count()): + compute_description["devices"][i] = { + "gpu_type": torch.cuda.get_device_name(i), + "device_capability": torch.cuda.get_device_capability(i), + "device_properties": str(torch.cuda.get_device_properties(i)), + } + + return compute_description + + @PublicAPI def get_experiment_description( config, @@ -2184,15 +2209,6 @@ def get_experiment_description( description["config"] = config description["torch_version"] = torch.__version__ - - gpu_info = {} - if torch.cuda.is_available(): - # Assumption: All nodes are of the same instance type. 
- # TODO: fix for Ray where workers may be of different skus - gpu_info = {"gpu_type": torch.cuda.get_device_name(0), "gpus_per_node": torch.cuda.device_count()} - - compute_description = {"num_nodes": backend.num_nodes, **gpu_info} - - description["compute"] = compute_description + description["compute"] = _get_compute_description(backend) return description diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py index 7f41c4376be..10f5af9a4ca 100644 --- a/ludwig/trainers/trainer.py +++ b/ludwig/trainers/trainer.py @@ -487,6 +487,14 @@ def write_step_summary(cls, train_summary_writer, combined_loss, all_losses, ste / (1000**3), global_step=step, ) + + # Utilization. + # https://pytorch.org/docs/stable/generated/torch.cuda.utilization.html#torch.cuda.utilization + train_summary_writer.add_scalar( + f"cuda/device{i}/utilization", + torch.cuda.utilization(i), + global_step=step, + ) train_summary_writer.flush() def is_cpu_training(self):