Skip to content

Commit

Permalink
Print indexes of largest grad
Browse files Browse the repository at this point in the history
  • Loading branch information
zhu-han committed Dec 30, 2024
1 parent 32cf5be commit 06b9138
Showing 1 changed file with 33 additions and 8 deletions.
41 changes: 33 additions & 8 deletions egs/librispeech/ASR/zipformer/optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,18 +633,37 @@ def _show_param_with_unusual_grad(
"""
largest_ratio = 0.0
largest_name = ""
# ratios_names is a list of 3-tuples: (grad_ratio, param_name, tensor)
ratios_names = []
for (p, state, batch_param_names) in tuples:
dims = list(range(1, p.ndim))
grad_ratio = (p.grad**2).mean(dim=dims) / state["exp_avg_sq"].mean(
dim=dims

def mean(x):
# workaround for bad interface of torch's "mean" for when dims is the empty list.
if len(dims) > 0:
return x.mean(dim=dims)
else:
return x

grad_ratio = (
(mean(p.grad**2) / state["exp_avg_sq"].mean(dim=dims))
.sqrt()
.to("cpu")
)

ratios_names += zip(
grad_ratio.tolist(), batch_param_names, p.grad.unbind(dim=0)
)
max_grad_ratio, max_index = grad_ratio.to("cpu").max(dim=0)
if max_grad_ratio.item() > largest_ratio:
largest_ratio = max_grad_ratio.item()
largest_name = batch_param_names[max_index.item()]

ratios_names = sorted(ratios_names, reverse=True)
ratios_names = ratios_names[:10]
ratios_names = [
(ratio, name, largest_index(tensor))
for (ratio, name, tensor) in ratios_names
]

logging.warning(
f"Parameter with most larger-than-usual grad is {largest_name}, with ratio (cur_grad / normal_grad) of "
f"{largest_ratio ** 0.5}"
f"Parameters with most larger-than-usual grads, with ratios, are: {ratios_names}"
)

def _show_gradient_dominating_parameter(
Expand Down Expand Up @@ -714,6 +733,12 @@ def _show_gradient_dominating_parameter(
)


def largest_index(x: Tensor):
x = x.contiguous()
argmax = x.abs().argmax().item()
return [(argmax // x.stride(i)) % x.size(i) for i in range(x.ndim)]


class LRScheduler(object):
"""
Base-class for learning rate schedulers where the learning-rate depends on both the
Expand Down

0 comments on commit 06b9138

Please sign in to comment.