
Commit 14ecab4
address #10
lucidrains committed Aug 8, 2023
1 parent 8d560e9 commit 14ecab4
Showing 3 changed files with 9 additions and 3 deletions.
1 change: 1 addition & 0 deletions recurrent_memory_transformer_pytorch/attend.py

@@ -83,6 +83,7 @@ def flash_attn(self, q, k, v, mask = None):
         if exists(mask):
             if mask.ndim != 4:
                 mask = rearrange(mask, 'b j -> b 1 1 j')
 
+            mask = mask.expand(-1, heads, q_len, -1)
 
         # Check if there is a compatible device for flash attention
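
For context, this hunk normalizes the attention mask before it reaches PyTorch's scaled_dot_product_attention: a (batch, keys) boolean mask is broadcast to (batch, 1, 1, keys) and expanded over heads and query positions. Below is a minimal standalone sketch of that shape handling; the tensor names and sizes are illustrative, not taken from the repository.

    # minimal sketch of the mask shaping above, assuming torch >= 2.0 and einops
    # all sizes here are illustrative
    import torch
    import torch.nn.functional as F
    from einops import rearrange

    b, heads, q_len, k_len, dim_head = 2, 8, 16, 16, 64

    q = torch.randn(b, heads, q_len, dim_head)
    k = torch.randn(b, heads, k_len, dim_head)
    v = torch.randn(b, heads, k_len, dim_head)

    # boolean key padding mask: True = attend, False = masked out
    mask = torch.ones(b, k_len, dtype = torch.bool)
    mask[:, -4:] = False  # pretend the last 4 keys are padding

    # bring the mask to (b, 1, 1, j), then expand to (b, heads, q_len, j)
    if mask.ndim != 4:
        mask = rearrange(mask, 'b j -> b 1 1 j')

    mask = mask.expand(-1, heads, q_len, -1)

    out = F.scaled_dot_product_attention(q, k, v, attn_mask = mask)
    print(out.shape)  # torch.Size([2, 8, 16, 64])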

9 changes: 7 additions & 2 deletions recurrent_memory_transformer_pytorch/recurrent_memory_transformer.py

@@ -341,8 +341,13 @@ def forward(
         causal_mask = F.pad(causal_mask, (0, mem_length, read_mem_length, 0), value = False)
         causal_mask = F.pad(causal_mask, (read_mem_length, 0, 0, mem_length), value = True)
 
-        assert not exists(mask)
-        mask = rearrange(causal_mask, 'i j -> 1 1 i j')
+        causal_mask = rearrange(causal_mask, 'i j -> 1 1 i j')
+
+        if exists(mask):
+            mask = rearrange(mask, 'b j -> b 1 1 j')
+            mask = mask & causal_mask
+        else:
+            mask = causal_mask
 
         # rotary embedding - offset main positions by 10000, and keep all memories at position 0

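The substantive change for #10 lives in this hunk: rather than asserting that no mask was passed, a user-supplied (batch, seq) key padding mask is now broadcast to (batch, 1, 1, seq) and combined with the (1, 1, i, j) causal mask via a logical AND. A minimal standalone sketch of that combination follows; it uses illustrative shapes, checks mask is None in place of the repository's exists helper, and omits the memory-token padding shown above.

    # minimal sketch of combining a key padding mask with a causal mask, assuming torch and einops
    import torch
    from einops import rearrange

    batch, seq_len = 2, 6

    # lower-triangular causal mask, shape (i, j): True = may attend
    causal_mask = torch.ones(seq_len, seq_len, dtype = torch.bool).tril()

    # key padding mask, shape (b, j): True = real token, False = padding
    mask = torch.ones(batch, seq_len, dtype = torch.bool)
    mask[1, -2:] = False  # second sequence ends with 2 padding tokens

    causal_mask = rearrange(causal_mask, 'i j -> 1 1 i j')

    if mask is not None:
        mask = rearrange(mask, 'b j -> b 1 1 j')
        mask = mask & causal_mask  # broadcasts to (b, 1, i, j)
    else:
        mask = causal_mask

    print(mask.shape)  # torch.Size([2, 1, 6, 6])

Padding columns end up masked for every query row of that batch element, while the causal structure is preserved.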
2 changes: 1 addition & 1 deletion setup.py

@@ -3,7 +3,7 @@
 setup(
   name = 'recurrent-memory-transformer-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.4.2',
+  version = '0.4.3',
   license='MIT',
   description = 'Recurrent Memory Transformer - Pytorch',
   author = 'Phil Wang',