
Commit

backwards cuda causal mask hack now relies on padding once and indexing out

lucidrains committed Apr 8, 2024
1 parent 228d824 commit 0b5d5af
Showing 2 changed files with 1 addition and 7 deletions.
ring_attention_pytorch/ring_flash_attention_cuda.py (6 changes: 0 additions & 6 deletions)

@@ -767,12 +767,6 @@ def backward(ctx, do):
 
             else:
                 ring_dq, ring_dk, ring_dv = 0., 0., 0.
 
-            q = q[:, :row_length]
-            o = o[:, :row_length]
-            do = do[:, :row_length]
-            lse = lse[..., :row_length]
-
         else:
 
             (
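For context, this hunk drops the per-branch re-slicing of q, o, do and lse back to row_length; per the commit message, the backward causal-mask hack now pads the tensors once up front and indexes the valid rows back out afterwards. The sketch below is a minimal illustration of that pattern, not the repository's actual code: the helper pad_seq_to_multiple, the shapes, and the pad multiple of 128 are all assumed for the example.

```python
import torch
import torch.nn.functional as F

def pad_seq_to_multiple(t, multiple, dim = 1, value = 0.):
    # hypothetical helper: pad `t` along `dim` so its length is a multiple of `multiple`,
    # returning the padded tensor and the original (unpadded) length
    length = t.shape[dim]
    remainder = length % multiple
    if remainder == 0:
        return t, length
    pad_spec = [0, 0] * (t.ndim - dim - 1) + [0, multiple - remainder]
    return F.pad(t, pad_spec, value = value), length

q = torch.randn(2, 1000, 8, 64)    # (batch, seq, heads, dim head) - illustrative shapes
do = torch.randn_like(q)           # upstream gradient w.r.t. the output

# pad once, before any backward work
q_padded, row_length = pad_seq_to_multiple(q, 128)
do_padded, _ = pad_seq_to_multiple(do, 128)

# gradients are accumulated at the padded length
dq_padded = torch.zeros_like(q_padded)

# ... backward passes over the padded tensors would accumulate into dq_padded here ...

# index the valid rows out once at the end, instead of re-slicing inside every branch
dq = dq_padded[:, :row_length]
assert dq.shape == q.shape
```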
setup.py (2 changes: 1 addition & 1 deletion)

@@ -3,7 +3,7 @@
 setup(
   name = 'ring-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.3.6',
+  version = '0.3.7',
   license='MIT',
   description = 'Ring Attention - Pytorch',
   author = 'Phil Wang',
