From 42ff02ac32ffb6d25cd62d321b8161ebc7d7e220 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:19:32 +0800 Subject: [PATCH] Add weight support for LigerCrossEntropy (#420) ## Summary Resolve #404. Note: current implementation doesn't weight z loss. Reference: [PyTorch's CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) ## Testing Done It hasn't fully tested with other params. - Hardware Type: - [x] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [x] run `make test-convergence` to ensure convergence --- src/liger_kernel/ops/cross_entropy.py | 107 ++++++++-- .../ops/fused_linear_cross_entropy.py | 79 +++++--- .../transformers/cross_entropy.py | 3 + src/liger_kernel/transformers/functional.py | 3 + .../fused_linear_cross_entropy.py | 3 + test/transformers/test_cross_entropy.py | 187 +++++++++++++++++- .../test_fused_linear_cross_entropy.py | 23 ++- 7 files changed, 346 insertions(+), 59 deletions(-) diff --git a/src/liger_kernel/ops/cross_entropy.py b/src/liger_kernel/ops/cross_entropy.py index ca6f9f7db..3fa12c1ef 100644 --- a/src/liger_kernel/ops/cross_entropy.py +++ b/src/liger_kernel/ops/cross_entropy.py @@ -30,11 +30,14 @@ def liger_cross_entropy_kernel( X_stride, Y_ptr, Y_stride, + weight_ptr, loss_ptr, z_loss_ptr, loss_stride, n_cols, n_non_ignore, + sum_non_ignore_weight, + weight_sum, ignore_index, lse_square_scale: tl.constexpr, label_smoothing: tl.constexpr, @@ -42,6 +45,7 @@ def liger_cross_entropy_kernel( softcap, RETURN_Z_LOSS: tl.constexpr, BLOCK_SIZE: tl.constexpr, + HAS_WEIGHT: tl.constexpr, HAS_SOFTCAPPING: tl.constexpr, ): """ @@ -53,18 +57,22 @@ def liger_cross_entropy_kernel( X_stride (int): The stride of the input tensor. Y_ptr: Pointer to target tensor. Y_stride (int): The stride of the target tensor. + weight_ptr: Pointer to weight tensor. loss_ptr: Pointer to tensor to store the loss. z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0. loss_stride (int): The stride of the loss tensor. n_cols (int): The number of columns in the input tensor. - n_non_ignore (int): The number of non-ignored elements in the batch. + n_non_ignore (flaot): The number of non-ignored elements in the batch. + sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch. + weight_sum (float): The sum of weight tensor. ignore_index (int): The index to ignore in the target. label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training. - RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1. reduction (str): The string for the reduction to apply softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap). + RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1. BLOCK_SIZE (int): The block size for Triton operations. + HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes. HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not. """ @@ -89,6 +97,9 @@ def liger_cross_entropy_kernel( loss_ptr += program_id * loss_stride z_loss_ptr += program_id * loss_stride + if HAS_WEIGHT: + weight_y = tl.load(weight_ptr + y).cast(tl.float32) + # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax) # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867 @@ -117,7 +128,11 @@ def liger_cross_entropy_kernel( block_max = tl.max(X_block) if label_smoothing > 0: # scale X beforehand to avoid overflow - scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0)) + if HAS_WEIGHT: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block * weight_block, 0.0)) + else: + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0)) m_new = tl.maximum(m, block_max) d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new)) m = m_new @@ -153,18 +168,41 @@ def liger_cross_entropy_kernel( if HAS_SOFTCAPPING: intermediate = tanh(X_block / softcap) X_block = softcap * intermediate - # softmax(x_i) - X_block = tl.exp(X_block - m) / d - # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i) - X_block += 2 * lse_square_scale * lse * X_block - # smoothing term - X_block += -eps - # special handle dx_y - X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing)) - # reduction scale - if reduction == "mean": - X_block = X_block / (n_non_ignore) - # chain rule + + if not HAS_WEIGHT: + # softmax(x_i) + X_block = tl.exp(X_block - m) / d + # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i) + X_block += 2 * lse_square_scale * lse * X_block + # smoothing term + X_block += -eps + # special handle dx_y + X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing)) + # reduction scale + if reduction == "mean": + X_block = X_block / n_non_ignore + else: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + softmax_X = tl.exp(X_block - m) / d + # derivative of original_loss + dloss_ori = (1 - label_smoothing) * softmax_X + # specially handle dx_y + dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing)) + dloss_ori = dloss_ori * weight_y + # derivative of smooth_loss + dloss_smooth = eps * (-weight_block + softmax_X * weight_sum) + # derivative of z-loss + dz_loss = 2 * lse_square_scale * lse * softmax_X + # reduction scale + if reduction == "mean": + dloss_ori = dloss_ori / sum_non_ignore_weight + dloss_smooth = dloss_smooth / sum_non_ignore_weight + # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight. + dz_loss = dz_loss / n_non_ignore + # derivative of total_loss + X_block = dloss_ori + dloss_smooth + dz_loss + + # chain rule softcapping # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap)) if HAS_SOFTCAPPING: X_block = X_block * (1 - intermediate * intermediate) @@ -183,6 +221,8 @@ def liger_cross_entropy_kernel( # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1 # So we can safely calculate log (softmax(X_y)) without overflow loss = lse - ori_X_y + if HAS_WEIGHT: + loss = weight_y * loss # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p) @@ -193,17 +233,24 @@ def liger_cross_entropy_kernel( # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516 # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087 if label_smoothing > 0: - smooth_loss = scaled_x_sum + label_smoothing * lse + if HAS_WEIGHT: + smooth_loss = scaled_x_sum + eps * lse * weight_sum + else: + smooth_loss = scaled_x_sum + label_smoothing * lse loss = loss * (1 - label_smoothing) + smooth_loss # An auxiliary loss, z_loss # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html z_loss = lse_square_scale * lse * lse - loss += z_loss # Normalize the loss by the number of non-ignored elements if reduction is "mean" if reduction == "mean": + if HAS_WEIGHT: + loss = loss / sum_non_ignore_weight + else: + loss = loss / n_non_ignore + # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight. z_loss = z_loss / n_non_ignore - loss = loss / n_non_ignore + loss += z_loss tl.store(loss_ptr, loss) if RETURN_Z_LOSS == _TRUE: @@ -225,6 +272,7 @@ def liger_cross_entropy_kernel( def cross_entropy_forward( _input, target, + weight, ignore_index, lse_square_scale, label_smoothing, @@ -250,7 +298,20 @@ def cross_entropy_forward( else: z_loss_1d = loss_1d # dummy ptr when return_z_loss == False - n_non_ignore = (target != ignore_index).sum().item() + target_mask = target != ignore_index + n_non_ignore = target_mask.sum().item() + sum_non_ignore_weight = n_non_ignore + weight_sum = 0.0 + if weight is not None: + assert weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {weight.shape}" + assert torch.is_floating_point( + weight + ), f"If given, weight has to be a Tensor of floating point dtype. Got: {weight.dtype}" + sum_non_ignore_weight = torch.gather(weight, dim=0, index=target.masked_select(target_mask)).sum().item() + weight_sum = weight.sum().item() + # ensure weight is contiguous + if weight.stride(-1) != 1: + weight = weight.contiguous() # ensure _input and target are contiguous in the last dimension if _input.stride(-1) != 1: @@ -264,18 +325,22 @@ def cross_entropy_forward( X_stride=_input.stride(-2), Y_ptr=target, Y_stride=target.stride(-1), # always 1 + weight_ptr=weight if weight is not None else _input, # dummy if None loss_ptr=loss_1d, z_loss_ptr=z_loss_1d, loss_stride=loss_1d.stride(-1), # always 1 n_cols=V, n_non_ignore=n_non_ignore, + sum_non_ignore_weight=sum_non_ignore_weight, ignore_index=ignore_index, + weight_sum=weight_sum, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, reduction=reduction, softcap=softcap if softcap is not None else 0.0, RETURN_Z_LOSS=return_z_loss, BLOCK_SIZE=BLOCK_SIZE, + HAS_WEIGHT=True if weight is not None else False, HAS_SOFTCAPPING=True if softcap is not None else False, # TODO: 32 seems to give the best performance # Performance is quite sensitive to num_warps @@ -327,6 +392,7 @@ def forward( ctx, _input: torch.Tensor, target: torch.Tensor, + weight: Optional[torch.FloatTensor], ignore_index: int = -100, lse_square_scale: float = 0.0, label_smoothing: float = 0.0, @@ -341,6 +407,7 @@ def forward( ctx : The context object. _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size. target (tensor): The target tensor of shape (BT) where each value is in [0, V-1]. + weight(Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype ignore_index (int): The index to ignore in the target. lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training. label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. @@ -354,6 +421,7 @@ def forward( loss, z_loss, _input = cross_entropy_forward( _input, target, + weight, ignore_index, lse_square_scale, label_smoothing, @@ -395,4 +463,5 @@ def backward(ctx, grad_output, grad_ouput2): None, None, None, + None, ) diff --git a/src/liger_kernel/ops/fused_linear_cross_entropy.py b/src/liger_kernel/ops/fused_linear_cross_entropy.py index 83fcb108a..4df484135 100644 --- a/src/liger_kernel/ops/fused_linear_cross_entropy.py +++ b/src/liger_kernel/ops/fused_linear_cross_entropy.py @@ -17,6 +17,7 @@ def fused_linear_cross_entropy_forward( _input, weight, target, + ce_weight=None, bias=None, ignore_index=-100, lse_square_scale=0.0, @@ -47,8 +48,22 @@ def fused_linear_cross_entropy_forward( # we use fp32 for loss accumulator loss_1d = torch.zeros(BT, dtype=torch.float32, device=device) - # NOTE: skip .item() here to avoid CUDA synchronization - total_n_non_ignore = (target != ignore_index).sum() + # TODO: evaluate how CUDA synchronization caused by .item() affects the speed + target_mask = target != ignore_index + total_n_non_ignore = target_mask.sum().item() + total_sum_non_ignore_ce_weight = total_n_non_ignore + ce_weight_sum = 0.0 + if ce_weight is not None: + assert ce_weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {ce_weight.shape}" + assert torch.is_floating_point( + ce_weight + ), f"If given, weight has to be a Tensor of floating point dtype. Got: {ce_weight.dtype}" + total_sum_non_ignore_ce_weight = ( + torch.gather(ce_weight, dim=0, index=target.masked_select(target_mask)).sum().item() + ) + ce_weight_sum = ce_weight.sum().item() + if ce_weight.stride(-1) != 1: + ce_weight = ce_weight.contiguous() for chunk_id in range(num_chunks): start_idx = chunk_id * chunk_size @@ -66,7 +81,6 @@ def fused_linear_cross_entropy_forward( # unreduced loss loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size, - n_non_ignore = (target_chunk != ignore_index).sum().item() # ensure _input and target are contiguous logits_chunk = logits_chunk.contiguous() @@ -78,35 +92,28 @@ def fused_linear_cross_entropy_forward( X_stride=logits_chunk.stride(-2), Y_ptr=target_chunk, Y_stride=target_chunk.stride(-1), # always 1 + weight_ptr=ce_weight if ce_weight is not None else _input, # dummy if None loss_ptr=loss_1d_slice, z_loss_ptr=loss_1d_slice, # dummy ptr, not used loss_stride=loss_1d_slice.stride(-1), # always 1 n_cols=V, - n_non_ignore=n_non_ignore, + n_non_ignore=total_n_non_ignore, + sum_non_ignore_weight=total_sum_non_ignore_ce_weight, + weight_sum=ce_weight_sum, ignore_index=ignore_index, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, reduction=reduction, softcap=softcap if softcap is not None else 0.0, RETURN_Z_LOSS=0, # False + HAS_WEIGHT=True if ce_weight is not None else False, HAS_SOFTCAPPING=True if softcap is not None else False, BLOCK_SIZE=BLOCK_SIZE, num_warps=32 if not is_hip() else 16, ) - # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V - # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H - # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only - # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens. - # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients. - - if reduction == "mean": - alpha = n_non_ignore / total_n_non_ignore if total_n_non_ignore > 0 else 0.0 - else: - alpha = 1.0 - - loss_1d[start_idx:end_idx] = loss_1d_slice * alpha - grad_logits_chunk = logits_chunk * alpha # chunk_size x V + loss_1d[start_idx:end_idx] = loss_1d_slice + grad_logits_chunk = logits_chunk # chunk_size x V grad_input[start_idx:end_idx] = grad_logits_chunk @ weight @@ -118,7 +125,7 @@ def fused_linear_cross_entropy_forward( ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error. mat2=_input_chunk, out=grad_weight, - alpha=alpha, + alpha=1.0, beta=1.0, ) @@ -127,7 +134,7 @@ def fused_linear_cross_entropy_forward( input=grad_bias, other=logits_chunk.sum(dim=0), out=grad_bias, - alpha=alpha, + alpha=1.0, ) if reduction == "none": @@ -193,6 +200,7 @@ def forward( weight, target, bias=None, + ce_weight=None, ignore_index=-100, lse_square_scale=0.0, label_smoothing=0.0, @@ -212,21 +220,23 @@ def forward( target: (B*T) where each value is in [0, V-1] weight: (V, H) where V is the number of classes bias: (V) where V is the number of classes + ce_weight: a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype ignore_index: the index to ignore in the target label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. reduction: reduction to apply """ loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward( - _input, - weight, - target, - bias, - ignore_index, - lse_square_scale, - label_smoothing, - reduction, - softcap, + _input=_input, + weight=weight, + target=target, + bias=bias, + ce_weight=ce_weight, + ignore_index=ignore_index, + lse_square_scale=lse_square_scale, + label_smoothing=label_smoothing, + reduction=reduction, + softcap=softcap, ) # downcast to dtype and store for backward ctx.save_for_backward( @@ -243,4 +253,15 @@ def backward(ctx, grad_output): grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward( grad_output, grad_input, grad_weight, grad_bias ) - return (grad_input, grad_weight, None, grad_bias, None, None, None, None, None) + return ( + grad_input, + grad_weight, + None, + grad_bias, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/transformers/cross_entropy.py b/src/liger_kernel/transformers/cross_entropy.py index 9131c2e9e..d72fc3b00 100644 --- a/src/liger_kernel/transformers/cross_entropy.py +++ b/src/liger_kernel/transformers/cross_entropy.py @@ -8,6 +8,7 @@ class LigerCrossEntropyLoss(torch.nn.Module): def __init__( self, + weight: Optional[torch.FloatTensor] = None, ignore_index: int = -100, lse_square_scale: float = 0.0, label_smoothing: float = 0.0, @@ -28,6 +29,7 @@ def __init__( "none", }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}" assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}" + self.weight = weight self.ignore_index = ignore_index self.lse_square_scale = lse_square_scale self.label_smoothing = label_smoothing @@ -39,6 +41,7 @@ def forward(self, _input: torch.Tensor, target: torch.Tensor): loss, z_loss = LigerCrossEntropyFunction.apply( _input, target, + self.weight, self.ignore_index, self.lse_square_scale, self.label_smoothing, diff --git a/src/liger_kernel/transformers/functional.py b/src/liger_kernel/transformers/functional.py index 32c0e3298..dd34fafb1 100644 --- a/src/liger_kernel/transformers/functional.py +++ b/src/liger_kernel/transformers/functional.py @@ -32,6 +32,7 @@ def liger_cross_entropy( loss, z_loss = LigerCrossEntropyFunction.apply( input, target, + weight, ignore_index, lse_square_scale, label_smoothing, @@ -49,6 +50,7 @@ def liger_fused_linear_cross_entropy( weight, target, bias=None, + ce_weight=None, ignore_index: int = -100, lse_square_scale: float = 0.0, label_smoothing: float = 0.0, @@ -60,6 +62,7 @@ def liger_fused_linear_cross_entropy( weight, target, bias, + ce_weight, ignore_index, lse_square_scale, label_smoothing, diff --git a/src/liger_kernel/transformers/fused_linear_cross_entropy.py b/src/liger_kernel/transformers/fused_linear_cross_entropy.py index 406be7737..1de352e6c 100644 --- a/src/liger_kernel/transformers/fused_linear_cross_entropy.py +++ b/src/liger_kernel/transformers/fused_linear_cross_entropy.py @@ -8,6 +8,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module): def __init__( self, + ce_weight: Optional[torch.FloatTensor] = None, ignore_index: int = -100, lse_square_scale: float = 0.0, label_smoothing: float = 0.0, @@ -24,6 +25,7 @@ def __init__( "none", }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}" assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}" + self.ce_weight = ce_weight self.ignore_index = ignore_index self.lse_square_scale = lse_square_scale self.label_smoothing = label_smoothing @@ -36,6 +38,7 @@ def forward(self, lin_weight, _input, target, bias=None): lin_weight, target, bias, + self.ce_weight, self.ignore_index, self.lse_square_scale, self.label_smoothing, diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py index 534637a87..b88033f2a 100644 --- a/test/transformers/test_cross_entropy.py +++ b/test/transformers/test_cross_entropy.py @@ -21,6 +21,7 @@ class CrossEntropyWithZLoss(torch.nn.Module): def __init__( self, + weight=None, lse_square_scale=0.0, reduction="mean", ignore_index=-100, @@ -29,6 +30,7 @@ def __init__( dtype=torch.float32, ): super().__init__() + self.weight = weight self.lse_square_scale = lse_square_scale self.reduction = reduction self.ignore_index = ignore_index @@ -39,10 +41,14 @@ def __init__( def forward(self, logits, targets): # Loss calculations are all in float32 logits = logits.to(torch.float32) + + target_mask = targets != self.ignore_index + # Standard cross entropy loss ce_loss = F.cross_entropy( logits, targets, + weight=self.weight, reduction=self.reduction, label_smoothing=self.label_smoothing, ignore_index=self.ignore_index, @@ -53,9 +59,9 @@ def forward(self, logits, targets): # Z-loss term z_loss = torch.where(targets != self.ignore_index, self.lse_square_scale * (lse**2), 0.0) - z_loss = z_loss.to(logits.dtype) + if self.reduction == "mean": - z_loss = z_loss.sum() / (targets != self.ignore_index).sum() + z_loss = z_loss.sum() / target_mask.sum() elif self.reduction == "sum": z_loss = z_loss.sum() else: @@ -284,6 +290,74 @@ def _test_correctness_with_z_loss_with_other_params_once( assert_verbose_allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol) +def _test_correctness_with_weight_once(target_ce, B, T, V, reduction, weight, scalar, dtype, atol, rtol): + torch.manual_seed(0) + torch_ce = CrossEntropyLoss(weight=weight, reduction=reduction) + + _tensor = torch.randn(B * T, V, device=device, dtype=dtype) * scalar + _input = _tensor.detach().clone().requires_grad_(True) + _input2 = _tensor.detach().clone().requires_grad_(True) + + target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + + output = torch_ce(_input, target) + output2 = target_ce(_input2, target) + assert torch.allclose(output, output2, atol=atol, rtol=rtol) + + output.backward(gradient=torch.ones_like(output)) + output2.backward(gradient=torch.ones_like(output)) + assert torch.allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol) + + +def _test_correctness_with_weight_with_other_params_once( + target_ce, + B, + T, + V, + reduction, + weight, + lse_square_scale, + ignore_index, + label_smoothing, + softcap, + scalar, + dtype, + atol, + rtol, +): + torch.manual_seed(0) + torch_ce = CrossEntropyWithZLoss( + weight=weight, + lse_square_scale=lse_square_scale, + ignore_index=ignore_index, + reduction=reduction, + label_smoothing=label_smoothing, + dtype=dtype, + ) + + _tensor = torch.randn(B * T, V, device=device, dtype=dtype) * scalar + # upcasting to match liger's casting strategy + _input = _tensor.detach().clone().requires_grad_(True) + _input2 = _tensor.detach().clone().requires_grad_(True) + + target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint( + 1, B * T // 2, (1,) + ).item() # Random number of elements to set to ignore_index + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] # Randomly select indices + target[indices_to_assign] = ignore_index + + output = torch_ce(softcap * torch.tanh(_input.to(torch.float32) / softcap), target).to(dtype) + output2 = target_ce(_input2, target) + assert_verbose_allclose(output, output2, atol=atol, rtol=rtol) + + output.backward(gradient=torch.ones_like(output)) + output2.backward(gradient=torch.ones_like(output)) + assert_verbose_allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol) + + def _test_correctness_not_last_layer_once(target_ce, B, T, V, reduction, scalar, dtype, atol, rtol): torch_ce = CrossEntropyLoss(reduction=reduction) @@ -324,6 +398,7 @@ def _test_correctness_functional( y1, y1_z = liger_cross_entropy( x1, target, + None, ignore_index=0, lse_square_scale=1e-4, label_smoothing=0.1, @@ -331,7 +406,7 @@ def _test_correctness_functional( softcap=30.0, return_z_loss=True, ) - y2, y2_z = LigerCrossEntropyFunction.apply(x2, target, 0, 1e-4, 0.1, "mean", 30.0, True) + y2, y2_z = LigerCrossEntropyFunction.apply(x2, target, None, 0, 1e-4, 0.1, "mean", 30.0, True) assert torch.allclose(y1, y2, atol=atol, rtol=rtol) assert torch.allclose(y1_z, y2_z, atol=atol, rtol=rtol) @@ -660,6 +735,104 @@ def test_correctness_with_z_loss_with_other_params_once( (1.0, torch.float32, 1e-8, 1e-6), ], ) +def test_correctness_with_weight_once(B, T, V, reduction, scalar, dtype, atol, rtol): + weight = torch.rand(V, device=device, dtype=dtype) + test_ce = LigerCrossEntropyLoss(weight=weight, reduction=reduction) + _test_correctness_with_weight_once(test_ce, B, T, V, reduction, weight, scalar, dtype, atol, rtol) + + +@pytest.mark.parametrize( + "B, T, V", + [ + (2, 4096, 32000), # llama2, mistral + # # weird shapes + (3, 423, 32000), + ], +) +@pytest.mark.parametrize("reduction", ["sum", "mean", "none"]) +@pytest.mark.parametrize( + "ignore_index, lse_square_scale, label_smoothing, softcap", + [ + (-100, 1e-4, 0.1, 30.0), + (42, 1e-5, 0.2, 40.0), + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + pytest.param( + 1.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + (1.0, torch.float32, 1e-8, 1e-6), + ], +) +def test_correctness_with_weight_with_other_params_once( + B, + T, + V, + reduction, + lse_square_scale, + ignore_index, + label_smoothing, + softcap, + scalar, + dtype, + atol, + rtol, +): + weight = torch.rand(V, device=device, dtype=torch.float32) # match softcap casting + test_ce = LigerCrossEntropyLoss( + weight=weight, + lse_square_scale=lse_square_scale, + reduction=reduction, + ignore_index=ignore_index, + label_smoothing=label_smoothing, + softcap=softcap, + ) + _test_correctness_with_weight_with_other_params_once( + test_ce, + B, + T, + V, + reduction, + weight, + lse_square_scale, + ignore_index, + label_smoothing, + softcap, + scalar, + dtype, + atol, + rtol, + ) + + +@pytest.mark.parametrize( + "B, T, V", + [ + (2, 4096, 32000), # llama2, mistral + # # weird shapes + (3, 423, 32000), + ], +) +@pytest.mark.parametrize("reduction", ["sum", "mean"]) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + pytest.param( + 1.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + (1.0, torch.float32, 1e-8, 1e-6), + ], +) def test_correctness_not_last_layer(B, T, V, reduction, scalar, dtype, atol, rtol): liger_ce = LigerCrossEntropyLoss(reduction=reduction) _test_correctness_not_last_layer_once(liger_ce, B, T, V, reduction, scalar, dtype, atol, rtol) @@ -693,17 +866,21 @@ def test_float32_internal(): X_stride=X_bf16.stride(-2), Y_ptr=Y, Y_stride=Y.stride(-1), + weight_ptr=X_bf16, # dummy ptr, not used z_loss_ptr=loss_bf16, # dummy ptr, not used loss_ptr=loss_bf16, loss_stride=loss_bf16.stride(-1), n_cols=n_cols, n_non_ignore=n_non_ignore, + sum_non_ignore_weight=n_non_ignore, # not used + weight_sum=0.0, # not used ignore_index=ignore_index, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, reduction=reduction, softcap=softcap, RETURN_Z_LOSS=0, # False + HAS_WEIGHT=False, HAS_SOFTCAPPING=False, BLOCK_SIZE=BLOCK_SIZE, num_warps=32 if not is_hip() else 16, @@ -717,17 +894,21 @@ def test_float32_internal(): X_stride=X_fp32.stride(-2), Y_ptr=Y, Y_stride=Y.stride(-1), + weight_ptr=X_fp32, # dummy ptr, not used loss_ptr=loss_fp32, z_loss_ptr=loss_fp32, # dummy ptr, not used loss_stride=loss_fp32.stride(-1), n_cols=n_cols, n_non_ignore=n_non_ignore, + sum_non_ignore_weight=n_non_ignore, # not used + weight_sum=n_non_ignore, # not used ignore_index=ignore_index, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, reduction=reduction, softcap=softcap, RETURN_Z_LOSS=0, # False + HAS_WEIGHT=False, HAS_SOFTCAPPING=False, BLOCK_SIZE=BLOCK_SIZE, num_warps=32 if not is_hip() else 16, diff --git a/test/transformers/test_fused_linear_cross_entropy.py b/test/transformers/test_fused_linear_cross_entropy.py index a4d6ba2ff..d4e919811 100644 --- a/test/transformers/test_fused_linear_cross_entropy.py +++ b/test/transformers/test_fused_linear_cross_entropy.py @@ -39,6 +39,7 @@ def __init__( V: int, dtype: torch.dtype, bias: bool = False, + ce_weight: Optional[torch.FloatTensor] = None, ignore_index: int = -100, lse_square_scale: float = 0.0, label_smoothing: float = 0.0, @@ -48,6 +49,7 @@ def __init__( super().__init__() self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) self.ce_loss = CrossEntropyWithZLoss( + weight=ce_weight, ignore_index=ignore_index, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, @@ -68,6 +70,7 @@ def __init__( H: int, V: int, dtype: torch.dtype, + ce_weight: Optional[torch.FloatTensor] = None, bias: bool = False, ignore_index: int = -100, lse_square_scale: float = 0.0, @@ -78,6 +81,7 @@ def __init__( super().__init__() self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) self.ce_loss = LigerFusedLinearCrossEntropyLoss( + ce_weight=ce_weight, ignore_index=ignore_index, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, @@ -114,15 +118,11 @@ def forward(self, x, y): ) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize( - "label_smoothing, ignore_index, lse_square_scale, softcap", + "has_ce_weight, label_smoothing, ignore_index, lse_square_scale, softcap", [ - (0, -100, 0, None), - ( - 0.1, - 42, - 1e-4, - 30.0, - ), # Pass non-default values once to ensure all params work along + (False, 0, -100, 0, None), + # Pass non-default values once to ensure all params work along + (True, 0.1, 42, 1e-4, 30.0), ], ) def test_correctness( @@ -133,6 +133,7 @@ def test_correctness( scalar, dtype, bias, + has_ce_weight, lse_square_scale, label_smoothing, ignore_index, @@ -141,10 +142,15 @@ def test_correctness( atol, rtol, ): + if has_ce_weight: + ce_weight = torch.rand(V, device=device, dtype=torch.float32) + else: + ce_weight = None torch_lm_head_ce = TorchLMHeadCE( H=H, V=V, bias=bias, + ce_weight=ce_weight, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, ignore_index=ignore_index, @@ -156,6 +162,7 @@ def test_correctness( H=H, V=V, bias=bias, + ce_weight=ce_weight, lse_square_scale=lse_square_scale, label_smoothing=label_smoothing, ignore_index=ignore_index,