From 6d74d21d71ecc7c125598a93f600b95aa35e2811 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 11 Oct 2023 13:25:38 -0400 Subject: [PATCH] Log additional per-GPU information in model metadata files and GPU utilization on tensorboard. (#3712) --- ludwig/api.py | 36 ++++++++++++++++++++++++++---------- ludwig/trainers/trainer.py | 8 ++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/ludwig/api.py b/ludwig/api.py index 0afea35b1e0..c85fc67c3c5 100644 --- a/ludwig/api.py +++ b/ludwig/api.py @@ -2141,6 +2141,31 @@ def kfold_cross_validate( return kfold_cv_stats, kfold_split_indices +def _get_compute_description(backend) -> Dict: + """Returns the compute description for the backend.""" + compute_description = {"num_nodes": backend.num_nodes} + + if torch.cuda.is_available(): + # Assumption: All nodes are of the same instance type. + # TODO: fix for Ray where workers may be of different skus + compute_description.update( + { + "gpus_per_node": torch.cuda.device_count(), + "arch_list": torch.cuda.get_arch_list(), + "gencode_flags": torch.cuda.get_gencode_flags(), + "devices": {}, + } + ) + for i in range(torch.cuda.device_count()): + compute_description["devices"][i] = { + "gpu_type": torch.cuda.get_device_name(i), + "device_capability": torch.cuda.get_device_capability(i), + "device_properties": str(torch.cuda.get_device_properties(i)), + } + + return compute_description + + @PublicAPI def get_experiment_description( config, @@ -2184,15 +2209,6 @@ def get_experiment_description( description["config"] = config description["torch_version"] = torch.__version__ - - gpu_info = {} - if torch.cuda.is_available(): - # Assumption: All nodes are of the same instance type. 
- # TODO: fix for Ray where workers may be of different skus - gpu_info = {"gpu_type": torch.cuda.get_device_name(0), "gpus_per_node": torch.cuda.device_count()} - - compute_description = {"num_nodes": backend.num_nodes, **gpu_info} - - description["compute"] = compute_description + description["compute"] = _get_compute_description(backend) return description diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py index 7f41c4376be..10f5af9a4ca 100644 --- a/ludwig/trainers/trainer.py +++ b/ludwig/trainers/trainer.py @@ -487,6 +487,14 @@ def write_step_summary(cls, train_summary_writer, combined_loss, all_losses, ste / (1000**3), global_step=step, ) + + # Utilization. + # https://pytorch.org/docs/stable/generated/torch.cuda.utilization.html#torch.cuda.utilization + train_summary_writer.add_scalar( + f"cuda/device{i}/utilization", + torch.cuda.utilization(i), + global_step=step, + ) train_summary_writer.flush() def is_cpu_training(self):