
Commit

use cond instead of inline if+lint
youssef62 committed Jan 7, 2025
1 parent 5f765c9 commit e602378
Showing 4 changed files with 11 additions and 9 deletions.
aten/src/ATen/native/cuda/fused_adam_utils.cuh (2 additions & 2 deletions)

@@ -83,8 +83,8 @@ C10_DEVICE inline void adam_math(
       if (grad_scale_ptr) {
         r_args[kGradIdx][ii] = grad_to_store;
       }
-      //don't write into gradients if beta1 is 0
-      if (beta1>0){
+      // don't write into gradients if beta1 is 0
+      if (beta1>0) {
        r_args[kExpAvgIdx][ii] = exp_avg;
       }
       r_args[kExpAvgSqIdx][ii] = exp_avg_sq;
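Why the guard matters: with the adam.py change below, when beta1 == 0 the exp_avg argument handed to the kernel is the gradients tensor itself, so an unconditional store here would overwrite the incoming gradients. A minimal eager-mode sketch of that aliasing hazard (an illustration in Python, not the fused CUDA kernel):

    import torch

    grads = torch.ones(4)
    exp_avg = grads  # aliased, mirroring the beta1 == 0 fallback in adam.py
    beta1 = 0.0

    # the guard added above: only update the first moment when beta1 is non-zero
    if beta1 > 0:
        exp_avg.lerp_(grads, 1 - beta1)

    assert grads.eq(1).all()  # gradients are left intact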
test/test_optim.py (6 additions & 4 deletions)

@@ -2240,11 +2240,13 @@ def get_obj_size(d):
         ]

         num_params = 4
-        size_of_param_in_bytes = (
-            32 * 16 * dtype.__sizeof__()
-        )
+        size_of_param_in_bytes = 32 * 16 * dtype.__sizeof__()
         for optim_input in beta_1_optim_inputs:
-            zero = 0.0 if isinstance(optim_input.kwargs["betas"][0], float) else torch.tensor(0.0, device=device, dtype=dtype)
+            zero = (
+                0.0
+                if isinstance(optim_input.kwargs["betas"][0], float)
+                else torch.tensor(0.0, device=device, dtype=dtype)
+            )
             beta1_values = (optim_input.kwargs["betas"][0], zero)
             total_sizes = []  # will end up as [big_state_dict_size, no_exp_avg_sd_size]
             for beta1 in beta1_values:
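The test measures the total state-dict size twice, once with the configured beta1 and once with a matching zero (a float or a 0-dim tensor, to match the original type), expecting the second run to come out smaller because no exp_avg buffers are kept. A rough standalone version of the same check (a sketch assuming this commit's behavior; get_obj_size and the exact byte accounting in the real test differ):

    import torch

    param = torch.nn.Parameter(torch.randn(32, 16))
    sizes = []
    for beta1 in (0.9, 0.0):
        opt = torch.optim.Adam([param], betas=(beta1, 0.999))
        param.grad = torch.randn_like(param)
        opt.step()
        state = opt.state_dict()["state"][0]
        # sum the bytes held by tensor entries in the per-param state
        sizes.append(
            sum(
                v.numel() * v.element_size()
                for v in state.values()
                if isinstance(v, torch.Tensor)
            )
        )

    # expected under this change: [big_state_dict_size, no_exp_avg_sd_size]
    print(sizes)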
torch/optim/adam.py (2 additions & 2 deletions)

@@ -253,7 +253,7 @@ def step(self, closure=None):
             adam(
                 params_with_grad,
                 grads,
-                exp_avgs if beta1 > 0 else grads,
+                torch.cond(beta1 > 0, lambda: exp_avgs, lambda: grads),
                 exp_avg_sqs,
                 max_exp_avg_sqs,
                 state_steps,
@@ -433,7 +433,7 @@ def _single_tensor_adam(
             device_beta1 = beta1

         # Decay the first and second moment running average coefficient
-        if device_beta1 > 0:
+        if device_beta1 > 0:
             exp_avg.lerp_(grad, 1 - device_beta1)

         exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
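torch.cond is PyTorch's traceable higher-order conditional: both branches are given as callables, so a compiler sees the full structure instead of a data-dependent Python if. In eager mode it simply runs the selected branch, so the call above evaluates to exp_avgs or grads. A minimal sketch of the documented call shape (needs a recent PyTorch 2.x; unrelated to this diff's tensors):

    import torch

    def true_fn(x):
        return x.cos()

    def false_fn(x):
        return x.sin()

    x = torch.randn(4)
    # pred can be a Python bool or a boolean scalar tensor
    out = torch.cond(x.sum() > 0, true_fn, false_fn, (x,))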
torch/testing/_internal/common_optimizers.py (1 addition & 1 deletion)

@@ -570,7 +570,7 @@ def optim_inputs_func_adam(device, dtype=None):
                 params=None,
                 kwargs={"betas": (0.0, 0.999)},
                 desc="zero-beta1",
-            )
+            ),
         ]
         + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
         + (mps_supported_configs if _get_device_type(device) == "mps" else [])
