From b8a2ff79b8cbb63f4f224d9848f666ddce904ed8 Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Tue, 25 Oct 2022 14:12:54 +0100 Subject: [PATCH 01/32] support empty batches in memory manager and optimizer --- opacus/data_loader.py | 25 ++++++- opacus/grad_sample/README.md | 1 + opacus/grad_sample/conv.py | 8 +++ opacus/optimizers/optimizer.py | 18 +++-- opacus/privacy_engine.py | 11 +-- opacus/tests/batch_memory_manager_test.py | 85 ++++++++++++++++++++--- opacus/tests/privacy_engine_test.py | 14 ++++ opacus/utils/batch_memory_manager.py | 5 ++ 8 files changed, 142 insertions(+), 25 deletions(-) diff --git a/opacus/data_loader.py b/opacus/data_loader.py index 4feaaf94..8b200d49 100644 --- a/opacus/data_loader.py +++ b/opacus/data_loader.py @@ -29,7 +29,9 @@ def wrap_collate_with_empty( - collate_fn: Optional[_collate_fn_t], sample_empty_shapes: Sequence + collate_fn: Optional[_collate_fn_t], + sample_empty_shapes: Sequence[torch.Size], + dtypes: Sequence[torch.dtype], ): """ Wraps given collate function to handle empty batches. @@ -49,7 +51,10 @@ def collate(batch): if len(batch) > 0: return collate_fn(batch) else: - return [torch.zeros(x) for x in sample_empty_shapes] + return [ + torch.zeros(shape, dtype=dtype) + for shape, dtype in zip(sample_empty_shapes, dtypes) + ] return collate @@ -67,6 +72,19 @@ def shape_safe(x: Any): return x.shape if hasattr(x, "shape") else () +def dtype_safe(x: Any): + """ + Exception-safe getter for ``dtype`` attribute + + Args: + x: any object + + Returns: + ``x.shape`` if attribute exists, empty tuple otherwise + """ + return x.dtype if hasattr(x, "dtype") else type(x) + + class DPDataLoader(DataLoader): """ DataLoader subclass that always does Poisson sampling and supports empty batches @@ -144,6 +162,7 @@ def __init__( generator=generator, ) sample_empty_shapes = [[0, *shape_safe(x)] for x in dataset[0]] + dtypes = [dtype_safe(x) for x in dataset[0]] if collate_fn is None: collate_fn = default_collate @@ -156,7 +175,7 @@ def __init__( dataset=dataset, batch_sampler=batch_sampler, num_workers=num_workers, - collate_fn=wrap_collate_with_empty(collate_fn, sample_empty_shapes), + collate_fn=wrap_collate_with_empty(collate_fn, sample_empty_shapes, dtypes), pin_memory=pin_memory, timeout=timeout, worker_init_fn=worker_init_fn, diff --git a/opacus/grad_sample/README.md b/opacus/grad_sample/README.md index 1a78499a..e3eed52a 100644 --- a/opacus/grad_sample/README.md +++ b/opacus/grad_sample/README.md @@ -74,6 +74,7 @@ Please note that these are known limitations and we plan to improve Expanded Wei | `batch_first=False` | ✅ Supported | Not supported | ✅ Supported | | Recurrent networks | ✅ Supported | Not supported | ✅ Supported | | Padding `same` in Conv | ✅ Supported | Not supported | ✅ Supported | +| Empty poisson batches | ✅ Supported | Not supported | ✅ Supported | † Note, that performance differences are unstable and can vary a lot depending on the exact model and batch size. Numbers above are averaged over benchmarks with small models consisting of convolutional and linear layers. 
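A minimal sketch of what the wrapped collate function introduced above returns for an empty Poisson batch. The empty shapes and dtypes are derived from the first dataset element, mirroring `shape_safe`/`dtype_safe` as used in `DPDataLoader.__init__` in this patch; the toy `TensorDataset` and its sizes are illustrative only, not taken from the patch.

```python
import torch
from torch.utils.data import TensorDataset

# Toy dataset: float features of shape (5,), integer labels.
dataset = TensorDataset(torch.randn(8, 5), torch.randint(0, 2, (8,)))

# Mirrors shape_safe / dtype_safe applied to dataset[0] in this patch.
sample_empty_shapes = [[0, *getattr(x, "shape", ())] for x in dataset[0]]
dtypes = [getattr(x, "dtype", type(x)) for x in dataset[0]]

# What the wrapped collate falls back to when the sampled batch is empty:
empty_batch = [
    torch.zeros(shape, dtype=dtype)
    for shape, dtype in zip(sample_empty_shapes, dtypes)
]
print([t.shape for t in empty_batch])  # [torch.Size([0, 5]), torch.Size([0])]
print([t.dtype for t in empty_batch])  # [torch.float32, torch.int64]
```

Before this change the fallback was `torch.zeros(x)` with no dtype, so an empty batch of integer labels came back as float32; threading the per-field dtypes through keeps empty batches type-compatible with non-empty ones.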
diff --git a/opacus/grad_sample/conv.py b/opacus/grad_sample/conv.py index 2b8e6299..4014d80e 100644 --- a/opacus/grad_sample/conv.py +++ b/opacus/grad_sample/conv.py @@ -41,6 +41,14 @@ def compute_conv_grad_sample( backprops: Backpropagations """ n = activations.shape[0] + if n == 0: + # Empty batch + ret = {} + ret[layer.weight] = torch.zeros_like(layer.weight).unsqueeze(0) + if layer.bias is not None and layer.bias.requires_grad: + ret[layer.bias] = torch.zeros_like(layer.bias).unsqueeze(0) + return ret + # get activations and backprops in shape depending on the Conv layer if type(layer) == nn.Conv2d: activations = unfold2d( diff --git a/opacus/optimizers/optimizer.py b/opacus/optimizers/optimizer.py index 46a414d9..1afb3ce5 100644 --- a/opacus/optimizers/optimizer.py +++ b/opacus/optimizers/optimizer.py @@ -394,13 +394,17 @@ def clip_and_accumulate(self): Stores clipped and aggregated gradients into `p.summed_grad``` """ - per_param_norms = [ - g.reshape(len(g), -1).norm(2, dim=-1) for g in self.grad_samples - ] - per_sample_norms = torch.stack(per_param_norms, dim=1).norm(2, dim=1) - per_sample_clip_factor = (self.max_grad_norm / (per_sample_norms + 1e-6)).clamp( - max=1.0 - ) + if len(self.grad_samples[0]) == 0: + # Empty batch + per_sample_clip_factor = torch.zeros((0,)) + else: + per_param_norms = [ + g.reshape(len(g), -1).norm(2, dim=-1) for g in self.grad_samples + ] + per_sample_norms = torch.stack(per_param_norms, dim=1).norm(2, dim=1) + per_sample_clip_factor = ( + self.max_grad_norm / (per_sample_norms + 1e-6) + ).clamp(max=1.0) for p in self.params: _check_processed_flag(p.grad_sample) diff --git a/opacus/privacy_engine.py b/opacus/privacy_engine.py index 4b46d337..e2414335 100644 --- a/opacus/privacy_engine.py +++ b/opacus/privacy_engine.py @@ -138,11 +138,12 @@ def __init__(self, *, accountant: str = "rdp", secure_mode: bool = False): self.secure_rng = csprng.create_random_device_generator("/dev/urandom") else: - warnings.warn( - "Secure RNG turned off. This is perfectly fine for experimentation as it allows " - "for much faster training performance, but remember to turn it on and retrain " - "one last time before production with ``secure_mode`` turned on." - ) + # warnings.warn( + # "Secure RNG turned off. This is perfectly fine for experimentation as it allows " + # "for much faster training performance, but remember to turn it on and retrain " + # "one last time before production with ``secure_mode`` turned on." 
+ # ) + pass def _prepare_optimizer( self, diff --git a/opacus/tests/batch_memory_manager_test.py b/opacus/tests/batch_memory_manager_test.py index 08bca477..7a57b573 100644 --- a/opacus/tests/batch_memory_manager_test.py +++ b/opacus/tests/batch_memory_manager_test.py @@ -37,8 +37,7 @@ class BatchMemoryManagerTest(unittest.TestCase): GSM_MODE = "hooks" def setUp(self) -> None: - self.data_size = 100 - self.batch_size = 10 + self.data_size = 256 self.inps = torch.randn(self.data_size, 5) self.tgts = torch.randn( self.data_size, @@ -46,11 +45,11 @@ def setUp(self) -> None: self.dataset = TensorDataset(self.inps, self.tgts) - def _init_training(self, **data_loader_kwargs): + def _init_training(self, batch_size=10, **data_loader_kwargs): model = Model() optimizer = torch.optim.SGD(model.parameters(), lr=0.1) data_loader = DataLoader( - self.dataset, batch_size=self.batch_size, **data_loader_kwargs + self.dataset, batch_size=batch_size, **data_loader_kwargs ) return model, optimizer, data_loader @@ -58,16 +57,22 @@ def _init_training(self, **data_loader_kwargs): @given( num_workers=st.integers(0, 4), pin_memory=st.booleans(), + batch_size=st.sampled_from([8, 16, 64]), + max_physical_batch_size=st.sampled_from([4, 8]), ) @settings(deadline=10000) def test_basic( self, num_workers: int, pin_memory: bool, + batch_size: int, + max_physical_batch_size: int, ): + batches_per_step = max(1, batch_size // max_physical_batch_size) model, optimizer, data_loader = self._init_training( num_workers=num_workers, pin_memory=pin_memory, + batch_size=batch_size, ) privacy_engine = PrivacyEngine() @@ -80,22 +85,19 @@ def test_basic( poisson_sampling=False, grad_sample_mode=self.GSM_MODE, ) - max_physical_batch_size = 3 with BatchMemoryManager( data_loader=data_loader, max_physical_batch_size=max_physical_batch_size, optimizer=optimizer, ) as new_data_loader: - self.assertEqual( - len(data_loader), len(data_loader.dataset) // self.batch_size - ) + self.assertEqual(len(data_loader), len(data_loader.dataset) // batch_size) self.assertEqual( len(new_data_loader), len(data_loader.dataset) // max_physical_batch_size, ) weights_before = torch.clone(model._module.fc.weight) for i, (x, y) in enumerate(new_data_loader): - self.assertTrue(x.shape[0] <= 3) + self.assertTrue(x.shape[0] <= max_physical_batch_size) out = model(x) loss = (y - out).mean() @@ -104,7 +106,63 @@ def test_basic( optimizer.step() optimizer.zero_grad() - if i % 4 < 3: + if (i + 1) % batches_per_step > 0: + self.assertTrue( + torch.allclose(model._module.fc.weight, weights_before) + ) + else: + self.assertFalse( + torch.allclose(model._module.fc.weight, weights_before) + ) + weights_before = torch.clone(model._module.fc.weight) + + @given( + num_workers=st.integers(0, 4), + pin_memory=st.booleans(), + ) + @settings(deadline=10000) + def test_empty_batch( + self, + num_workers: int, + pin_memory: bool, + ): + batch_size = 2 + max_physical_batch_size = 10 + torch.manual_seed(30) + + model, optimizer, data_loader = self._init_training( + num_workers=num_workers, + pin_memory=pin_memory, + batch_size=batch_size, + ) + + privacy_engine = PrivacyEngine() + model, optimizer, data_loader = privacy_engine.make_private( + module=model, + optimizer=optimizer, + data_loader=data_loader, + noise_multiplier=0.0, + max_grad_norm=1e5, + poisson_sampling=True, + grad_sample_mode=self.GSM_MODE, + ) + with BatchMemoryManager( + data_loader=data_loader, + max_physical_batch_size=max_physical_batch_size, + optimizer=optimizer, + ) as new_data_loader: + weights_before = 
torch.clone(model._module.fc.weight) + for i, (x, y) in enumerate(new_data_loader): + self.assertTrue(x.shape[0] <= max_physical_batch_size) + + out = model(x) + loss = (y - out).mean() + + loss.backward() + optimizer.step() + optimizer.zero_grad() + + if len(x) == 0: self.assertTrue( torch.allclose(model._module.fc.weight, weights_before) ) @@ -174,3 +232,10 @@ def test_equivalent_to_one_batch(self): ) class BatchMemoryManagerTestWithExpandedWeights(BatchMemoryManagerTest): GSM_MODE = "ew" + + def test_empty_batch(self): + pass + + +class BatchMemoryManagerTestWithFunctorch(BatchMemoryManagerTest): + GSM_MODE = "functorch" diff --git a/opacus/tests/privacy_engine_test.py b/opacus/tests/privacy_engine_test.py index 90af717a..aede7578 100644 --- a/opacus/tests/privacy_engine_test.py +++ b/opacus/tests/privacy_engine_test.py @@ -805,6 +805,20 @@ def _init_model( return SampleConvNet() +class PrivacyEngineConvNetEmptyBatchTest(PrivacyEngineConvNetTest): + def setUp(self): + super().setUp() + + # This will trigger multiple empty batches with poisson sampling enabled + self.BATCH_SIZE = 1 + + def test_checkpoints(self): + pass + + def test_noise_level(self): + pass + + class PrivacyEngineConvNetFrozenTest(BasePrivacyEngineTest, unittest.TestCase): def _init_data(self): ds = FakeData( diff --git a/opacus/utils/batch_memory_manager.py b/opacus/utils/batch_memory_manager.py index f7e3b65f..8f757e5d 100644 --- a/opacus/utils/batch_memory_manager.py +++ b/opacus/utils/batch_memory_manager.py @@ -53,6 +53,11 @@ def __init__( def __iter__(self): for batch_idxs in self.sampler: + if len(batch_idxs) == 0: + self.optimizer.signal_skip_step(do_skip=False) + yield [] + continue + split_idxs = np.array_split( batch_idxs, math.ceil(len(batch_idxs) / self.max_batch_size) ) From 2e1b9d7adae727e7bf541c9271be5fb99d9b8216 Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Tue, 25 Oct 2022 14:15:23 +0100 Subject: [PATCH 02/32] restore warning --- opacus/privacy_engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/opacus/privacy_engine.py b/opacus/privacy_engine.py index e2414335..c1826559 100644 --- a/opacus/privacy_engine.py +++ b/opacus/privacy_engine.py @@ -138,11 +138,11 @@ def __init__(self, *, accountant: str = "rdp", secure_mode: bool = False): self.secure_rng = csprng.create_random_device_generator("/dev/urandom") else: - # warnings.warn( - # "Secure RNG turned off. This is perfectly fine for experimentation as it allows " - # "for much faster training performance, but remember to turn it on and retrain " - # "one last time before production with ``secure_mode`` turned on." - # ) + warnings.warn( + "Secure RNG turned off. This is perfectly fine for experimentation as it allows " + "for much faster training performance, but remember to turn it on and retrain " + "one last time before production with ``secure_mode`` turned on." 
+ ) pass def _prepare_optimizer( From df9d1ab3b5750e428dab8f0b77b0d943b2af6aaf Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Tue, 25 Oct 2022 16:49:32 +0100 Subject: [PATCH 03/32] disable functorch test for 1.13+ --- opacus/tests/batch_memory_manager_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opacus/tests/batch_memory_manager_test.py b/opacus/tests/batch_memory_manager_test.py index 7a57b573..14bdbea7 100644 --- a/opacus/tests/batch_memory_manager_test.py +++ b/opacus/tests/batch_memory_manager_test.py @@ -237,5 +237,8 @@ def test_empty_batch(self): pass +@unittest.skipIf( + torch.__version__ >= API_CUTOFF_VERSION, "not supported in this torch version" +) class BatchMemoryManagerTestWithFunctorch(BatchMemoryManagerTest): GSM_MODE = "functorch" From b952c2ae77f97675fc45fcb089805591f37588bd Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Thu, 27 Oct 2022 14:33:51 +0100 Subject: [PATCH 04/32] 0-batch tests --- opacus/grad_sample/README.md | 2 +- opacus/grad_sample/embedding.py | 4 ++++ opacus/tests/grad_samples/common.py | 24 ++++++++++++------- opacus/tests/grad_samples/conv1d_test.py | 2 +- opacus/tests/grad_samples/conv2d_test.py | 6 ++--- opacus/tests/grad_samples/conv3d_test.py | 2 +- opacus/tests/grad_samples/embedding_test.py | 2 +- opacus/tests/grad_samples/group_norm_test.py | 2 +- opacus/tests/grad_samples/linear_test.py | 2 +- .../tests/grad_samples/sequence_bias_test.py | 2 +- 10 files changed, 29 insertions(+), 19 deletions(-) diff --git a/opacus/grad_sample/README.md b/opacus/grad_sample/README.md index e3eed52a..7827680f 100644 --- a/opacus/grad_sample/README.md +++ b/opacus/grad_sample/README.md @@ -74,7 +74,7 @@ Please note that these are known limitations and we plan to improve Expanded Wei | `batch_first=False` | ✅ Supported | Not supported | ✅ Supported | | Recurrent networks | ✅ Supported | Not supported | ✅ Supported | | Padding `same` in Conv | ✅ Supported | Not supported | ✅ Supported | -| Empty poisson batches | ✅ Supported | Not supported | ✅ Supported | +| Empty poisson batches | ✅ Supported | Not supported | Not supported | † Note, that performance differences are unstable and can vary a lot depending on the exact model and batch size. Numbers above are averaged over benchmarks with small models consisting of convolutional and linear layers. 
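The 0-batch tests in this patch exist because Poisson sampling includes each example independently, so small logical batch sizes regularly produce batches of size zero. A back-of-the-envelope sketch of how often that happens, using the same numbers as the `test_empty_batch` case earlier in this series (expected batch size 2 over 256 examples); the seed, the number of draws, and the direct Bernoulli mask standing in for the DP data loader's Poisson sampler are illustrative assumptions.

```python
import torch

torch.manual_seed(0)
sample_rate = 2 / 256   # expected batch size 2 out of 256 examples
n_draws = 1000

empty = 0
for _ in range(n_draws):
    # Per-example Bernoulli inclusion, as in Poisson sampling.
    mask = torch.rand(256) < sample_rate
    if mask.sum() == 0:
        empty += 1

print(empty / n_draws)  # roughly exp(-2) ~ 0.135
```

With an expected batch size of 2, roughly 13–14% of logical batches are empty, which is why the grad samplers and the optimizer each need a well-defined empty-batch path rather than treating it as an edge case.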
diff --git a/opacus/grad_sample/embedding.py b/opacus/grad_sample/embedding.py index 94b86c4b..f0aa575a 100644 --- a/opacus/grad_sample/embedding.py +++ b/opacus/grad_sample/embedding.py @@ -39,6 +39,10 @@ def compute_embedding_grad_sample( torch.backends.cudnn.deterministic = True batch_size = activations.shape[0] + if batch_size == 0: + ret[layer.weight] = torch.zeros_like(layer.weight).unsqueeze(0) + return ret + index = ( activations.unsqueeze(-1) .expand(*activations.shape, layer.embedding_dim) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index f7fa1eac..5a981ba2 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -226,6 +226,9 @@ def run_test( except ImportError: grad_sample_modes = ["hooks"] + if type(x) is not PackedSequence and x.numel() == 0: + grad_sample_modes = ["hooks"] + for grad_sample_mode in grad_sample_modes: for loss_reduction in ["sum", "mean"]: @@ -262,6 +265,14 @@ def run_test_with_reduction( rtol=10e-5, grad_sample_mode="hooks", ): + opacus_grad_samples = self.compute_opacus_grad_sample( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) + if type(x) is PackedSequence: x_unpacked = _unpack_packedsequences(x) microbatch_grad_samples = self.compute_microbatch_grad_sample( @@ -270,18 +281,13 @@ def run_test_with_reduction( batch_first=batch_first, loss_reduction=loss_reduction, ) - else: + elif x.numel() > 0: microbatch_grad_samples = self.compute_microbatch_grad_sample( x, module, batch_first=batch_first, loss_reduction=loss_reduction ) - - opacus_grad_samples = self.compute_opacus_grad_sample( - x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode, - ) + else: + # We've checked opacus can handle 0-sized batch. 
Microbatch doesn't make sense + return if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): raise ValueError( diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 179e496e..9ad2981a 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -25,7 +25,7 @@ class Conv1d_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), C=st.sampled_from([1, 3, 32]), W=st.integers(6, 10), out_channels_mapper=st.sampled_from([expander, shrinker]), diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index f27ad158..6d9a5b33 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -29,7 +29,7 @@ class Conv2d_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), C=st.sampled_from([1, 3, 32]), H=st.integers(11, 17), W=st.integers(11, 17), @@ -73,7 +73,7 @@ def test_conv2d( groups=groups, ) is_ew_compatible = ( - padding != "same" + padding != "same" and N > 0 ) # TODO add support for padding = 'same' with EW # Test regular GSM @@ -86,7 +86,7 @@ def test_conv2d( ew_compatible=is_ew_compatible, ) - if padding != "same": + if padding != "same" and N > 0: # Test 'convolution as a backward' GSM # 'convolution as a backward' doesn't support padding=same conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index afa01b4b..1647ecef 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -25,7 +25,7 @@ class Conv3d_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), C=st.sampled_from([1, 3, 32]), D=st.integers(3, 6), H=st.integers(6, 10), diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index ff02a130..e803f2dd 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -23,7 +23,7 @@ class Embedding_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), T=st.integers(1, 5), Q=st.integers(1, 4), R=st.integers(1, 2), diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index 2f4bbaff..4f32f0e0 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -30,7 +30,7 @@ class GroupNorm_test(GradSampleHooks_test): """ @given( - N=st.integers(1, 4), + N=st.integers(0, 4), C=st.integers(1, 8), H=st.integers(5, 10), W=st.integers(4, 8), diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index 3b23f3ef..82ce7409 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -23,7 +23,7 @@ class Linear_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), Z=st.integers(1, 4), H=st.integers(1, 3), W=st.integers(10, 17), diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index b61ffc66..ec36d74b 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -23,7 +23,7 @@ class SequenceBias_test(GradSampleHooks_test): @given( - N=st.integers(1, 4), + N=st.integers(0, 4), T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), From 
5c7fc6ff0e0cac98d0bb44b0e1836962b8014da3 Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Thu, 27 Oct 2022 14:36:33 +0100 Subject: [PATCH 05/32] lint --- opacus/privacy_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/opacus/privacy_engine.py b/opacus/privacy_engine.py index c1826559..4b46d337 100644 --- a/opacus/privacy_engine.py +++ b/opacus/privacy_engine.py @@ -143,7 +143,6 @@ def __init__(self, *, accountant: str = "rdp", secure_mode: bool = False): "for much faster training performance, but remember to turn it on and retrain " "one last time before production with ``secure_mode`` turned on." ) - pass def _prepare_optimizer( self, From 64f08adff89645a51b116dd516870b9676ab9728 Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Thu, 27 Oct 2022 15:37:31 +0100 Subject: [PATCH 06/32] EW test fix --- opacus/tests/grad_samples/conv1d_test.py | 4 +++- opacus/tests/grad_samples/conv3d_test.py | 2 +- opacus/tests/grad_samples/embedding_test.py | 2 +- opacus/tests/grad_samples/group_norm_test.py | 2 +- opacus/tests/grad_samples/linear_test.py | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 9ad2981a..2576f159 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -67,4 +67,6 @@ def test_conv1d( dilation=dilation, groups=groups, ) - self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4) + self.run_test( + x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=N > 0 + ) diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index 1647ecef..e50909e2 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -71,7 +71,7 @@ def test_conv3d( groups=groups, ) is_ew_compatible = ( - dilation == 1 and padding != "same" + dilation == 1 and padding != "same" and N > 0 ) # TODO add support for padding = 'same' with EW self.run_test( x, diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index e803f2dd..e0142d36 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -56,4 +56,4 @@ def test_input_across_dims( emb = nn.Embedding(V, D) x = torch.randint(low=0, high=V - 1, size=size) - self.run_test(x, emb, batch_first=batch_first) + self.run_test(x, emb, batch_first=batch_first, ew_compatible=N > 0) diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index 4f32f0e0..e3836b93 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -54,4 +54,4 @@ def test_3d_input_groups( x = torch.randn([N, C, H, W]) norm = nn.GroupNorm(num_groups=num_groups, num_channels=C, affine=True) - self.run_test(x, norm, batch_first=True) + self.run_test(x, norm, batch_first=True, ew_compatible=N > 0) diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index 82ce7409..e856e9d3 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -57,4 +57,4 @@ def test_input_bias( x = torch.randn(x_shape) if not batch_first: x = x.transpose(0, 1) - self.run_test(x, linear, batch_first=batch_first) + self.run_test(x, linear, batch_first=batch_first, ew_compatible=N > 0) From df7c355064a94fe894eba2c10fe5290fac480388 Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Thu, 27 Oct 2022 
15:59:24 +0100 Subject: [PATCH 07/32] docstring up --- opacus/data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opacus/data_loader.py b/opacus/data_loader.py index 8b200d49..884eed88 100644 --- a/opacus/data_loader.py +++ b/opacus/data_loader.py @@ -80,7 +80,7 @@ def dtype_safe(x: Any): x: any object Returns: - ``x.shape`` if attribute exists, empty tuple otherwise + ``x.dtype`` if attribute exists, type of x otherwise """ return x.dtype if hasattr(x, "dtype") else type(x) From b64b06afe419354bcf8ae7fa21a36a7bf765a857 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Thu, 27 Oct 2022 18:53:14 +0100 Subject: [PATCH 08/32] Implement per sample grads util and refactor code --- opacus/tests/grad_samples/common.py | 322 ++------------- opacus/tests/grad_samples/conv1d_test.py | 32 +- opacus/tests/grad_samples/conv2d_test.py | 109 ++++-- opacus/tests/grad_samples/conv3d_test.py | 58 +-- .../dp_multihead_attention_test.py | 28 +- opacus/tests/grad_samples/dp_rnn_test.py | 36 +- opacus/tests/grad_samples/embedding_test.py | 29 +- opacus/tests/grad_samples/group_norm_test.py | 21 +- .../grad_samples/instance_norm1d_test.py | 19 +- .../grad_samples/instance_norm2d_test.py | 20 +- .../grad_samples/instance_norm3d_test.py | 23 +- opacus/tests/grad_samples/layer_norm_test.py | 24 +- opacus/tests/grad_samples/linear_test.py | 26 +- .../tests/grad_samples/sequence_bias_test.py | 20 +- opacus/utils/per_sample_gradients_utils.py | 367 ++++++++++++++++++ 15 files changed, 691 insertions(+), 443 deletions(-) create mode 100644 opacus/utils/per_sample_gradients_utils.py diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index f7fa1eac..2652aa95 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -13,20 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import unittest -from typing import Dict, List, Union +from typing import Union -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from opacus.grad_sample import wrap_model -from opacus.utils.module_utils import trainable_parameters -from opacus.utils.packed_sequences import compute_seq_lengths -from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence +from torch.nn.utils.rnn import PackedSequence from torch.testing import assert_allclose +from opacus.utils.per_sample_gradients_utils import compute_grad_samples_microbatch_and_opacus + def expander(x, factor: int = 2): return x * factor @@ -36,189 +33,20 @@ def shrinker(x, factor: int = 2): return max(1, x // factor) # if avoid returning 0 for x == 1 -class ModelWithLoss(nn.Module): - """ - To test the gradients of a module, we need to have a loss. - This module makes it easy to get a loss from any nn.Module, and automatically generates - a target y vector for it in the forward (of all zeros of the correct size). - This reduces boilerplate while testing. - """ - - supported_reductions = ["mean", "sum"] - - def __init__(self, module: nn.Module, loss_reduction: str = "mean"): - """ - Instantiates this module. - - Args: - module: The nn.Module you want to test. - loss_reduction: What reduction to apply to the loss. Defaults to "mean". - - Raises: - ValueError: If ``loss_reduction`` is not among those supported. - """ - super().__init__() - self.wrapped_module = module - - if loss_reduction not in self.supported_reductions: - raise ValueError( - f"Passed loss_reduction={loss_reduction}. 
Only {self.supported_reductions} supported." - ) - self.criterion = nn.L1Loss(reduction=loss_reduction) - - def forward(self, x): - x = self.wrapped_module(x) - if type(x) is PackedSequence: - loss = _compute_loss_packedsequences(self.criterion, x) - else: - y = torch.zeros_like(x) - loss = self.criterion(x, y) - return loss - - -def clone_module(module: nn.Module) -> nn.Module: - """ - Handy utility to clone an nn.Module. PyTorch doesn't always support copy.deepcopy(), so it is - just easier to serialize the model to a BytesIO and read it from there. - - Args: - module: The module to clone - - Returns: - The clone of ``module`` - """ - with io.BytesIO() as bytesio: - torch.save(module, bytesio) - bytesio.seek(0) - module_copy = torch.load(bytesio) - return module_copy - - class GradSampleHooks_test(unittest.TestCase): """ Set of common testing utils. It is meant to be subclassed by your test. See other tests as an example of how this is done. """ - def compute_microbatch_grad_sample( - self, - x: Union[torch.Tensor, List[torch.Tensor]], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - ) -> Dict[str, torch.tensor]: - """ - Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients - with batch_size set to 1, and manually accumulating them. This is our reference for testing - as this method is obviously correct, but slow. - - Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. - batch_first: Whether batch size is the first dimension (as opposed to the second). - Defaults to True. - - Returns: - Dictionary mapping parameter_name -> per-sample-gradient for that parameter - """ - torch.use_deterministic_algorithms(True) - torch.manual_seed(0) - np.random.seed(0) - - module = ModelWithLoss(clone_module(module), loss_reduction) - - for _, p in trainable_parameters(module): - p.microbatch_grad_sample = [] - - if not batch_first and type(x) is not list: - # This allows us to iterate with x_i - x = x.transpose(0, 1) - - # Invariant: x is [B, T, ...] - - for x_i in x: - # x_i is [T, ...] - x_i = x_i.unsqueeze( - 0 if batch_first else 1 - ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] - module.zero_grad() - loss_i = module(x_i) - loss_i.backward() - for p in module.parameters(): - p.microbatch_grad_sample.append(p.grad.detach().clone()) - - for _, p in trainable_parameters(module): - if batch_first: - p.microbatch_grad_sample = torch.stack( - p.microbatch_grad_sample, dim=0 # [B, T, ...] - ) - else: - p.microbatch_grad_sample = torch.stack( - p.microbatch_grad_sample, dim=1 # [T, B, ...] - ).transpose( - 0, 1 - ) # Opacus's semantics is that grad_samples are ALWAYS batch_first: [B, T, ...] - - microbatch_grad_samples = { - name: p.microbatch_grad_sample - for name, p in trainable_parameters(module.wrapped_module) - } - return microbatch_grad_samples - - def compute_opacus_grad_sample( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", - ) -> Dict[str, torch.tensor]: - """ - Runs Opacus to compute per-sample gradients and return them for testing purposes. - - Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. - batch_first: Whether batch size is the first dimension (as opposed to the second). - Defaults to True. - loss_reduction: What reduction to apply to the loss. Defaults to "mean". 
- - Returns: - Dictionary mapping parameter_name -> per-sample-gradient for that parameter - """ - torch.use_deterministic_algorithms(True) - torch.manual_seed(0) - np.random.seed(0) - - gs_module = wrap_model( - model=clone_module(module), - grad_sample_mode=grad_sample_mode, - batch_first=batch_first, - loss_reduction=loss_reduction, - ) - grad_sample_module = ModelWithLoss(gs_module, loss_reduction) - - grad_sample_module.zero_grad() - loss = grad_sample_module(x) - loss.backward() - - opacus_grad_samples = { - name: p.grad_sample - for name, p in trainable_parameters( - grad_sample_module.wrapped_module._module - ) - } - - return opacus_grad_samples - def run_test( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - atol=10e-6, - rtol=10e-5, - ew_compatible=True, + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + atol=10e-6, + rtol=10e-5, + ew_compatible=True, ): grad_sample_modes = ["hooks", "functorch"] try: @@ -228,9 +56,8 @@ def run_test( for grad_sample_mode in grad_sample_modes: for loss_reduction in ["sum", "mean"]: - with self.subTest( - grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction + grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction ): self.run_test_with_reduction( x, @@ -253,42 +80,21 @@ def run_test( ) def run_test_with_reduction( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ): - if type(x) is PackedSequence: - x_unpacked = _unpack_packedsequences(x) - microbatch_grad_samples = self.compute_microbatch_grad_sample( - x_unpacked, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - ) - else: - microbatch_grad_samples = self.compute_microbatch_grad_sample( - x, module, batch_first=batch_first, loss_reduction=loss_reduction - ) - - opacus_grad_samples = self.compute_opacus_grad_sample( - x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode, - ) - - if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): - raise ValueError( - "Keys not matching! 
" - f"Keys only in microbatch: {microbatch_grad_samples.keys() - opacus_grad_samples.keys()}; " - f"Keys only in Opacus: {opacus_grad_samples.keys() - microbatch_grad_samples.keys()}" - ) + microbatch_grad_samples, opacus_grad_samples = \ + compute_grad_samples_microbatch_and_opacus(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode) self.check_shapes(microbatch_grad_samples, opacus_grad_samples, loss_reduction) self.check_values( @@ -296,10 +102,10 @@ def run_test_with_reduction( ) def check_shapes( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -327,12 +133,12 @@ def check_shapes( ) def check_values( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, - atol, - rtol, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, + atol, + rtol, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -358,59 +164,3 @@ def check_values( f"A total of {len(failed)} values do not match " f"for loss_reduction={loss_reduction}: \n\t{failed_str}" ) - - -def _unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: - r""" - Produces a list of tensors from X (PackedSequence) such that this list was used to create X with batch_first=True - - Args: - X: A PackedSequence from which the output list of tensors will be produced. - - Returns: - unpacked_data: The list of tensors produced from X. - """ - - X_padded = pad_packed_sequence(X) - X_padded = X_padded[0].permute((1, 0, 2)) - - if X.sorted_indices is not None: - X_padded = X_padded[X.sorted_indices] - - seq_lens = compute_seq_lengths(X.batch_sizes) - unpacked_data = [0] * len(seq_lens) - for idx, length in enumerate(seq_lens): - unpacked_data[idx] = X_padded[idx][:length, :] - - return unpacked_data - - -def _compute_loss_packedsequences( - criterion: nn.L1Loss, x: PackedSequence -) -> torch.Tensor: - r""" - This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, - it computes the same way as with non-packed data. For 'mean' reduced L1 loss, it transforms x (PackedSequence) - into a list of tensors such that this list of tensors was used to create this PackedSequence in the first - place using batch_first=True and then takes the mean of the loss values produced from applying criterion on - each sequence sample. - - Args: - criterion: An L1 loss function with reduction either set to 'sum' or 'mean'. - x: Data in the form of a PackedSequence. - - Returns: - A loss variable, reduced either using summation or averaging from L1 errors. 
- """ - - if criterion.reduction == "sum": - y = torch.zeros_like(x[0]) - return criterion(x[0], y) - elif criterion.reduction == "mean": - x = _unpack_packedsequences(x) - loss_sum = 0 - for x_i in x: - y_i = torch.zeros_like(x_i) - loss_sum += criterion(x_i, y_i) - loss_mean = loss_sum / len(x) - return loss_mean diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 179e496e..07c2d3f4 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class Conv1d_test(GradSampleHooks_test): @@ -34,26 +35,28 @@ class Conv1d_test(GradSampleHooks_test): padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), groups=st.integers(1, 12), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_conv1d( - self, - N: int, - C: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, + self, + N: int, + C: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -67,4 +70,9 @@ def test_conv1d( dilation=dilation, groups=groups, ) - self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4) + if test_or_check == 1: + self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index f27ad158..603aec56 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -25,6 +25,7 @@ from torch.testing import assert_allclose from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Conv2d_test(GradSampleHooks_test): @@ -39,26 +40,28 @@ class Conv2d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, "same", "valid"]), dilation=st.integers(1, 3), groups=st.integers(1, 16), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv2d( - self, - N: int, - C: int, - H: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, + self, + N: int, + C: int, + H: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -73,26 +76,11 @@ def test_conv2d( groups=groups, 
) is_ew_compatible = ( - padding != "same" + padding != "same" ) # TODO add support for padding = 'same' with EW # Test regular GSM - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-4, - ew_compatible=is_ew_compatible, - ) - - if padding != "same": - # Test 'convolution as a backward' GSM - # 'convolution as a backward' doesn't support padding=same - conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] - GradSampleModule.GRAD_SAMPLERS[ - nn.Conv2d - ] = convolution2d_backward_as_a_convolution + if test_or_check == 1: self.run_test( x, conv, @@ -101,6 +89,43 @@ def test_conv2d( rtol=10e-4, ew_compatible=is_ew_compatible, ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode + ) + + if padding != "same": + # Test 'convolution as a backward' GSM + # 'convolution as a backward' doesn't support padding=same + conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] + GradSampleModule.GRAD_SAMPLERS[ + nn.Conv2d + ] = convolution2d_backward_as_a_convolution + if test_or_check == 1: + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + ew_compatible=is_ew_compatible, + ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode, + ) GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] = conv2d_gsm @given( @@ -119,19 +144,19 @@ def test_conv2d( ) @settings(deadline=30000) def test_unfold2d( - self, - B: int, - C: int, - H: int, - W: int, - k_h: int, - k_w: int, - pad_h: int, - pad_w: int, - stride_h: int, - stride_w: int, - dilation_h: int, - dilation_w: int, + self, + B: int, + C: int, + H: int, + W: int, + k_h: int, + k_w: int, + pad_h: int, + pad_w: int, + stride_h: int, + stride_w: int, + dilation_h: int, + dilation_w: int, ): X = torch.randn(B, C, H, W) X_unfold_torch = torch.nn.functional.unfold( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index afa01b4b..ea7b95a6 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Conv3d_test(GradSampleHooks_test): @@ -36,28 +37,30 @@ class Conv3d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, (1, 2, 3), "same", "valid"]), dilation=st.sampled_from([1, (1, 2, 2)]), groups=st.integers(1, 16), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv3d( - self, - N: int, - C: int, - D: int, - H: int, - W: int, - out_channels_mapper: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]], - padding: Union[int, Tuple[int]], - dilation: int, - groups: int, + self, + N: int, + C: int, + D: int, + H: int, + W: int, + out_channels_mapper: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]], + padding: Union[int, Tuple[int]], + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % 
groups != 0 ): # since in_channels and out_channels must be divisible by groups return x = torch.randn([N, C, D, H, W]) @@ -71,13 +74,24 @@ def test_conv3d( groups=groups, ) is_ew_compatible = ( - dilation == 1 and padding != "same" + dilation == 1 and padding != "same" ) # TODO add support for padding = 'same' with EW - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - ew_compatible=is_ew_compatible, - ) + if test_or_check == 1: + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + ew_compatible=is_ew_compatible, + ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index 057f2391..1f255e89 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -20,6 +20,7 @@ from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class DPMultiheadAttentionAdapter(nn.Module): @@ -53,18 +54,20 @@ class MultiHeadAttention_test(GradSampleHooks_test): add_bias_kv=st.booleans(), add_zero_attn=st.booleans(), kv_dim=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_multihead_attention( - self, - N: int, - T: int, - D: int, - P: int, - bias: bool, - add_bias_kv: bool, - add_zero_attn: bool, - kv_dim: bool, + self, + N: int, + T: int, + D: int, + P: int, + bias: bool, + add_bias_kv: bool, + add_zero_attn: bool, + kv_dim: bool, + test_or_check: int ): if kv_dim: @@ -86,4 +89,9 @@ def test_multihead_attention( v = torch.randn([T, N, D]) x = torch.stack((q, k, v), dim=-1) - self.run_test(x, attn, batch_first=False) + if test_or_check == 1: + self.run_test(x, attn, batch_first=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, attn, batch_first=False, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 39f29ad6..23402ccf 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -21,7 +21,7 @@ from opacus.utils.packed_sequences import _gen_packed_data from .common import GradSampleHooks_test - +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes MODELS = [ DPRNN, @@ -59,21 +59,23 @@ class RNN_test(GradSampleHooks_test): bidirectional=st.booleans(), using_packed_sequences=st.booleans(), packed_sequences_sorted=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_rnn( - self, - model, - N: int, - T: int, - D: int, - H: int, - num_layers: int, - bias: bool, - batch_first: bool, - bidirectional: bool, - using_packed_sequences: bool, - packed_sequences_sorted: bool, + self, + model, + N: int, + T: int, + D: int, + H: int, + num_layers: int, + bias: bool, + batch_first: bool, + bidirectional: bool, + using_packed_sequences: bool, + packed_sequences_sorted: bool, + test_or_check: int ): rnn = model( D, @@ -92,4 +94,10 @@ def test_rnn( x = torch.randn([N, T, D]) else: x = 
torch.randn([T, N, D]) - self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) + + if test_or_check == 1: + self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, rnn, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index ff02a130..7e053c1a 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class Embedding_test(GradSampleHooks_test): @@ -31,18 +32,20 @@ class Embedding_test(GradSampleHooks_test): D=st.integers(10, 17), dim=st.integers(2, 4), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_across_dims( - self, - N: int, - T: int, - Q: int, - R: int, - V: int, - D: int, - dim: int, - batch_first: bool, + self, + N: int, + T: int, + Q: int, + R: int, + V: int, + D: int, + dim: int, + batch_first: bool, + test_or_check: int ): if dim == 1: # TODO: fix when dim is 1 @@ -56,4 +59,10 @@ def test_input_across_dims( emb = nn.Embedding(V, D) x = torch.randint(low=0, high=V - 1, size=size) - self.run_test(x, emb, batch_first=batch_first) + + if test_or_check == 1: + self.run_test(x, emb, batch_first=batch_first) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, emb, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index 2f4bbaff..b7f13cb9 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class GroupNorm_test(GradSampleHooks_test): @@ -35,15 +36,17 @@ class GroupNorm_test(GradSampleHooks_test): H=st.integers(5, 10), W=st.integers(4, 8), num_groups=st.sampled_from([1, 4, "C"]), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_3d_input_groups( - self, - N: int, - C: int, - H: int, - W: int, - num_groups: Union[int, str], + self, + N: int, + C: int, + H: int, + W: int, + num_groups: Union[int, str], + test_or_check: int ): if num_groups == "C": @@ -55,3 +58,9 @@ def test_3d_input_groups( x = torch.randn([N, C, H, W]) norm = nn.GroupNorm(num_groups=num_groups, num_channels=C, affine=True) self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index 151e3b1d..7001f1c8 100644 --- a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import 
GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class InstanceNorm1d_test(GradSampleHooks_test): @@ -26,15 +27,23 @@ class InstanceNorm1d_test(GradSampleHooks_test): N=st.integers(1, 4), C=st.integers(1, 3), W=st.integers(5, 10), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_3d_input( - self, - N: int, - C: int, - W: int, + self, + N: int, + C: int, + W: int, + test_or_check: int ): x = torch.randn([N, C, W]) norm = nn.InstanceNorm1d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm2d_test.py b/opacus/tests/grad_samples/instance_norm2d_test.py index cc7ba1f1..6f955f87 100644 --- a/opacus/tests/grad_samples/instance_norm2d_test.py +++ b/opacus/tests/grad_samples/instance_norm2d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class InstanceNorm2d_test(GradSampleHooks_test): @@ -27,16 +28,23 @@ class InstanceNorm2d_test(GradSampleHooks_test): C=st.integers(1, 3), W=st.integers(5, 10), H=st.integers(4, 8), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_4d_input( - self, - N: int, - C: int, - W: int, - H: int, + self, + N: int, + C: int, + W: int, + H: int, + test_or_check: int ): x = torch.randn([N, C, H, W]) norm = nn.InstanceNorm2d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 1b3b3de3..68d5298e 100644 --- a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class InstanceNorm3d_test(GradSampleHooks_test): @@ -28,16 +29,24 @@ class InstanceNorm3d_test(GradSampleHooks_test): W=st.integers(5, 10), H=st.integers(4, 8), Z=st.integers(1, 4), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_5d_input( - self, - N: int, - C: int, - W: int, - H: int, - Z: int, + self, + N: int, + C: int, + W: int, + H: int, + Z: int, + test_or_check: int + ): x = torch.randn([N, C, Z, H, W]) norm = nn.InstanceNorm3d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/layer_norm_test.py 
b/opacus/tests/grad_samples/layer_norm_test.py index 3e69eaa2..ea556dd8 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class LayerNorm_test(GradSampleHooks_test): @@ -29,16 +30,18 @@ class LayerNorm_test(GradSampleHooks_test): W=st.integers(5, 10), input_dim=st.integers(2, 4), norm_dim=st.integers(1, 3), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_norm( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - norm_dim: int, + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + norm_dim: int, + test_or_check: int ): if norm_dim >= input_dim: @@ -64,4 +67,9 @@ def test_input_norm( norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) x = torch.randn(x_shape) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index 3b23f3ef..cb3ef89e 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Linear_test(GradSampleHooks_test): @@ -30,17 +31,19 @@ class Linear_test(GradSampleHooks_test): input_dim=st.integers(2, 4), bias=st.booleans(), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_bias( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - bias: bool, - batch_first: bool, + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + bias: bool, + batch_first: bool, + test_or_check: int ): if input_dim == 2: @@ -57,4 +60,9 @@ def test_input_bias( x = torch.randn(x_shape) if not batch_first: x = x.transpose(0, 1) - self.run_test(x, linear, batch_first=batch_first) + if test_or_check == 1: + self.run_test(x, linear, batch_first=batch_first) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, linear, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index b61ffc66..0bbf0e40 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -19,6 +19,7 @@ from opacus.layers import SequenceBias from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class SequenceBias_test(GradSampleHooks_test): @@ -27,14 +28,16 @@ class SequenceBias_test(GradSampleHooks_test): T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_batch_second( - self, - N: int, - T: int, - D: int, - batch_first: bool, + self, + N: int, + T: int, + D: int, + batch_first: bool, + 
test_or_check: int ): seqbias = SequenceBias(D, batch_first) @@ -42,4 +45,9 @@ def test_batch_second( x = torch.randn([N, T, D]) else: x = torch.randn([T, N, D]) - self.run_test(x, seqbias, batch_first, ew_compatible=False) + if test_or_check == 1: + self.run_test(x, seqbias, batch_first, ew_compatible=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=False): + assert check_per_sample_gradients_are_correct(x, seqbias, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py new file mode 100644 index 00000000..74ce32ed --- /dev/null +++ b/opacus/utils/per_sample_gradients_utils.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +from typing import Union, Dict, List + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence + +from opacus.grad_sample import wrap_model +from opacus.utils.module_utils import trainable_parameters +from opacus.utils.packed_sequences import compute_seq_lengths + + +def clone_module(module: nn.Module) -> nn.Module: + """ + Handy utility to clone an nn.Module. PyTorch doesn't always support copy.deepcopy(), so it is + just easier to serialize the model to a BytesIO and read it from there. + + Args: + module: The module to clone + + Returns: + The clone of ``module`` + """ + with io.BytesIO() as bytesio: + torch.save(module, bytesio) + bytesio.seek(0) + module_copy = torch.load(bytesio) + return module_copy + + +class ModelWithLoss(nn.Module): + """ + To test the gradients of a module, we need to have a loss. + This module makes it easy to get a loss from any nn.Module, and automatically generates + a target y vector for it in the forward (of all zeros of the correct size). + This reduces boilerplate while testing. + """ + + supported_reductions = ["mean", "sum"] + + def __init__(self, module: nn.Module, loss_reduction: str = "mean"): + """ + Instantiates this module. + + Args: + module: The nn.Module you want to test. + loss_reduction: What reduction to apply to the loss. Defaults to "mean". + + Raises: + ValueError: If ``loss_reduction`` is not among those supported. + """ + super().__init__() + self.wrapped_module = module + + if loss_reduction not in self.supported_reductions: + raise ValueError( + f"Passed loss_reduction={loss_reduction}. Only {self.supported_reductions} supported." 
+ ) + self.criterion = nn.L1Loss(reduction=loss_reduction) + + def forward(self, x): + x = self.wrapped_module(x) + if type(x) is PackedSequence: + loss = _compute_loss_packedsequences(self.criterion, x) + else: + y = torch.zeros_like(x) + loss = self.criterion(x, y) + return loss + + +def compute_microbatch_grad_sample( + x: Union[torch.Tensor, List[torch.Tensor]], + module: nn.Module, + batch_first=True, + loss_reduction="mean", +) -> Dict[str, torch.tensor]: + """ + Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients + with batch_size set to 1, and manually accumulating them. This is our reference for testing + as this method is obviously correct, but slow. + + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + + Returns: + Dictionary mapping parameter_name -> per-sample-gradient for that parameter + """ + torch.use_deterministic_algorithms(True) + torch.manual_seed(0) + np.random.seed(0) + + module = ModelWithLoss(clone_module(module), loss_reduction) + + for _, p in trainable_parameters(module): + p.microbatch_grad_sample = [] + + if not batch_first and type(x) is not list: + # This allows us to iterate with x_i + x = x.transpose(0, 1) + + # Invariant: x is [B, T, ...] + + for x_i in x: + # x_i is [T, ...] + x_i = x_i.unsqueeze( + 0 if batch_first else 1 + ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] + module.zero_grad() + loss_i = module(x_i) + loss_i.backward() + for p in module.parameters(): + p.microbatch_grad_sample.append(p.grad.detach().clone()) + + for _, p in trainable_parameters(module): + if batch_first: + p.microbatch_grad_sample = torch.stack( + p.microbatch_grad_sample, dim=0 # [B, T, ...] + ) + else: + p.microbatch_grad_sample = torch.stack( + p.microbatch_grad_sample, dim=1 # [T, B, ...] + ).transpose( + 0, 1 + ) # Opacus's semantics is that grad_samples are ALWAYS batch_first: [B, T, ...] + + microbatch_grad_samples = { + name: p.microbatch_grad_sample + for name, p in trainable_parameters(module.wrapped_module) + } + return microbatch_grad_samples + + +def compute_opacus_grad_sample( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", +) -> Dict[str, torch.tensor]: + """ + Runs Opacus to compute per-sample gradients and return them for testing purposes. + + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + loss_reduction: What reduction to apply to the loss. Defaults to "mean". 
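+        Example (illustrative sketch, not part of the original patch; assumes a
+        plain ``nn.Linear`` module and a random input batch):
+
+            >>> x = torch.randn(16, 8)
+            >>> grads = compute_opacus_grad_sample(x, nn.Linear(8, 4))
+            >>> grads["weight"].shape  # one gradient per sample in the batch
+            torch.Size([16, 4, 8])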
+ + Returns: + Dictionary mapping parameter_name -> per-sample-gradient for that parameter + """ + torch.use_deterministic_algorithms(True) + torch.manual_seed(0) + np.random.seed(0) + + gs_module = wrap_model( + model=clone_module(module), + grad_sample_mode=grad_sample_mode, + batch_first=batch_first, + loss_reduction=loss_reduction, + ) + grad_sample_module = ModelWithLoss(gs_module, loss_reduction) + + grad_sample_module.zero_grad() + loss = grad_sample_module(x) + loss.backward() + + opacus_grad_samples = { + name: p.grad_sample + for name, p in trainable_parameters( + grad_sample_module.wrapped_module._module + ) + } + + return opacus_grad_samples + + +def check_torch_version_for_ew_sample() -> bool: + return torch.__version__ >= (1, 13) + + +def get_grad_sample_modes(use_ew: bool = False): + grad_sample_modes = ["hooks", "functorch"] + if use_ew and check_torch_version_for_ew_sample(): + grad_sample_modes.append("ew") + return grad_sample_modes + + +def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + *, + batch_first=True, + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks") -> bool: + if grad_sample_mode == "functorch": + import functorch # noqa + + reductions = ["sum", "mean"] + if grad_sample_mode == "ew": + if not batch_first: + raise RuntimeError(f"Batch should be first dimension.") + if not check_torch_version_for_ew_sample(): + raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") + reductions = ["sum"] + + correct = True + for loss_reduction in reductions: + correct = correct and check_per_sample_gradients_are_correct_with_reduction( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + atol=atol, + rtol=rtol, + grad_sample_mode=grad_sample_mode, + ) + + return correct + + +def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean"): + if type(x) is PackedSequence: + x_unpacked = unpack_packedsequences(x) + microbatch_grad_samples = compute_microbatch_grad_sample( + x_unpacked, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + ) + else: + microbatch_grad_samples = compute_microbatch_grad_sample( + x, module, batch_first=batch_first, loss_reduction=loss_reduction + ) + + return microbatch_grad_samples + + +def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks"): + microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction) + opacus_grad_samples = compute_opacus_grad_sample( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode + ) + + if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): + raise ValueError( + "Keys not matching! 
" + f"Keys only in microbatch: {microbatch_grad_samples.keys() - opacus_grad_samples.keys()}; " + f"Keys only in Opacus: {opacus_grad_samples.keys() - microbatch_grad_samples.keys()}" + ) + + return microbatch_grad_samples, opacus_grad_samples + + +def check_per_sample_gradients_are_correct_with_reduction( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks" +) -> bool: + microbatch_grad_samples, opacus_grad_samples = \ + compute_grad_samples_microbatch_and_opacus(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode) + + correct = True + for name, opacus_grad_sample in opacus_grad_samples.items(): + microbatch_grad_sample = microbatch_grad_samples[name] + correct = correct and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, + rtol) and opacus_grad_sample.shape == microbatch_grad_sample.shape + + return correct + + +def unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: + r""" + Produces a list of tensors from X (PackedSequence) such that this list was used to create X with batch_first=True + + Args: + X: A PackedSequence from which the output list of tensors will be produced. + + Returns: + unpacked_data: The list of tensors produced from X. + """ + + X_padded = pad_packed_sequence(X) + X_padded = X_padded[0].permute((1, 0, 2)) + + if X.sorted_indices is not None: + X_padded = X_padded[X.sorted_indices] + + seq_lens = compute_seq_lengths(X.batch_sizes) + unpacked_data = [0] * len(seq_lens) + for idx, length in enumerate(seq_lens): + unpacked_data[idx] = X_padded[idx][:length, :] + + return unpacked_data + + +def _compute_loss_packedsequences( + criterion: nn.L1Loss, x: PackedSequence +) -> torch.Tensor: + r""" + This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, + it computes the same way as with non-packed data. For 'mean' reduced L1 loss, it transforms x (PackedSequence) + into a list of tensors such that this list of tensors was used to create this PackedSequence in the first + place using batch_first=True and then takes the mean of the loss values produced from applying criterion on + each sequence sample. + + Args: + criterion: An L1 loss function with reduction either set to 'sum' or 'mean'. + x: Data in the form of a PackedSequence. + + Returns: + A loss variable, reduced either using summation or averaging from L1 errors. + """ + + if criterion.reduction == "sum": + y = torch.zeros_like(x[0]) + return criterion(x[0], y) + elif criterion.reduction == "mean": + x = unpack_packedsequences(x) + loss_sum = 0 + for x_i in x: + y_i = torch.zeros_like(x_i) + loss_sum += criterion(x_i, y_i) + loss_mean = loss_sum / len(x) + return loss_mean From 3f9f9cd1cfb27d86bd19f4aad179543ddcf919b8 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 12:22:15 +0100 Subject: [PATCH 09/32] Add docs and refactor --- opacus/utils/per_sample_gradients_utils.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 74ce32ed..7e00a9ec 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -100,6 +100,8 @@ def compute_microbatch_grad_sample( module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. 
batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. + loss_reduction: Indicates if the loss reduction (for aggregating the gradients) + is a sum or a mean operation. Can take values "sum" or "mean". Returns: Dictionary mapping parameter_name -> per-sample-gradient for that parameter @@ -165,6 +167,7 @@ def compute_opacus_grad_sample( batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. loss_reduction: What reduction to apply to the loss. Defaults to "mean". + grad_sample_mode: What sampling method to use to get gradients. Returns: Dictionary mapping parameter_name -> per-sample-gradient for that parameter @@ -213,6 +216,19 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence atol=10e-6, rtol=10e-5, grad_sample_mode="hooks") -> bool: + """ + A utility to check whether per sample gradients are computed correctly with a particular model. + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to check. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + atol: The relative tolerance parameter (numpy). + rtol: The absolute tolerance parameter (numpy). + grad_sample_mode: What sampling method to use to get gradients. + + Returns: True if per sample gradients were computed correctly. False otherwise. + """ if grad_sample_mode == "functorch": import functorch # noqa @@ -226,7 +242,7 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence correct = True for loss_reduction in reductions: - correct = correct and check_per_sample_gradients_are_correct_with_reduction( + correct = correct and _check_per_sample_gradients_are_correct_with_reduction( x, module, batch_first=batch_first, @@ -286,7 +302,7 @@ def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequ return microbatch_grad_samples, opacus_grad_samples -def check_per_sample_gradients_are_correct_with_reduction( +def _check_per_sample_gradients_are_correct_with_reduction( x: Union[torch.Tensor, PackedSequence], module: nn.Module, batch_first=True, From 765b84e76817bd12f1ff3e25948fa9da3998e434 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 12:51:18 +0100 Subject: [PATCH 10/32] Apply code style fixes --- opacus/tests/grad_samples/common.py | 72 ++++++------ opacus/tests/grad_samples/conv1d_test.py | 41 ++++--- opacus/tests/grad_samples/conv2d_test.py | 63 +++++----- opacus/tests/grad_samples/conv3d_test.py | 39 ++++--- .../dp_multihead_attention_test.py | 32 +++--- opacus/tests/grad_samples/dp_rnn_test.py | 38 +++--- opacus/tests/grad_samples/embedding_test.py | 32 +++--- opacus/tests/grad_samples/group_norm_test.py | 26 +++-- .../grad_samples/instance_norm1d_test.py | 20 ++-- .../grad_samples/instance_norm2d_test.py | 21 ++-- .../grad_samples/instance_norm3d_test.py | 23 ++-- opacus/tests/grad_samples/layer_norm_test.py | 28 +++-- opacus/tests/grad_samples/linear_test.py | 33 +++--- .../tests/grad_samples/sequence_bias_test.py | 22 ++-- opacus/utils/per_sample_gradients_utils.py | 108 ++++++++++-------- 15 files changed, 324 insertions(+), 274 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index 2652aa95..660fa5b1 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -22,7 +22,9 @@ from torch.nn.utils.rnn import PackedSequence from torch.testing 
import assert_allclose -from opacus.utils.per_sample_gradients_utils import compute_grad_samples_microbatch_and_opacus +from opacus.utils.per_sample_gradients_utils import ( + compute_grad_samples_microbatch_and_opacus, +) def expander(x, factor: int = 2): @@ -40,13 +42,13 @@ class GradSampleHooks_test(unittest.TestCase): """ def run_test( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - atol=10e-6, - rtol=10e-5, - ew_compatible=True, + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + atol=10e-6, + rtol=10e-5, + ew_compatible=True, ): grad_sample_modes = ["hooks", "functorch"] try: @@ -57,7 +59,7 @@ def run_test( for grad_sample_mode in grad_sample_modes: for loss_reduction in ["sum", "mean"]: with self.subTest( - grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction + grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction ): self.run_test_with_reduction( x, @@ -80,21 +82,25 @@ def run_test( ) def run_test_with_reduction( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ): - microbatch_grad_samples, opacus_grad_samples = \ - compute_grad_samples_microbatch_and_opacus(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode) + ( + microbatch_grad_samples, + opacus_grad_samples, + ) = compute_grad_samples_microbatch_and_opacus( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) self.check_shapes(microbatch_grad_samples, opacus_grad_samples, loss_reduction) self.check_values( @@ -102,10 +108,10 @@ def run_test_with_reduction( ) def check_shapes( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -133,12 +139,12 @@ def check_shapes( ) def check_values( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, - atol, - rtol, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, + atol, + rtol, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 07c2d3f4..932e5748 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class Conv1d_test(GradSampleHooks_test): @@ -35,28 +38,28 @@ class Conv1d_test(GradSampleHooks_test): padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), groups=st.integers(1, 12), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_conv1d( - self, - N: int, - C: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - 
stride: int, - padding: int, - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int, ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -74,5 +77,11 @@ def test_conv1d( self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode, + ) diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 603aec56..3ef94f6a 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -25,7 +25,10 @@ from torch.testing import assert_allclose from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Conv2d_test(GradSampleHooks_test): @@ -40,28 +43,28 @@ class Conv2d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, "same", "valid"]), dilation=st.integers(1, 3), groups=st.integers(1, 16), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_conv2d( - self, - N: int, - C: int, - H: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + H: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int, ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -76,7 +79,7 @@ def test_conv2d( groups=groups, ) is_ew_compatible = ( - padding != "same" + padding != "same" ) # TODO add support for padding = 'same' with EW # Test regular GSM @@ -97,7 +100,7 @@ def test_conv2d( batch_first=True, atol=10e-5, rtol=10e-4, - grad_sample_mode=grad_sample_mode + grad_sample_mode=grad_sample_mode, ) if padding != "same": @@ -144,19 +147,19 @@ def test_conv2d( ) @settings(deadline=30000) def test_unfold2d( - self, - B: int, - C: int, - H: int, - W: int, - k_h: int, - k_w: int, - pad_h: int, - pad_w: int, - stride_h: int, - stride_w: int, - dilation_h: int, - dilation_w: int, + self, + B: int, + C: int, + H: int, + W: int, + k_h: int, + k_w: int, + pad_h: int, + pad_w: int, + stride_h: int, + stride_w: int, + dilation_h: int, + dilation_w: int, ): X = torch.randn(B, C, H, W) X_unfold_torch = torch.nn.functional.unfold( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index ea7b95a6..04b60383 100644 --- 
a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Conv3d_test(GradSampleHooks_test): @@ -37,30 +40,30 @@ class Conv3d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, (1, 2, 3), "same", "valid"]), dilation=st.sampled_from([1, (1, 2, 2)]), groups=st.integers(1, 16), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_conv3d( - self, - N: int, - C: int, - D: int, - H: int, - W: int, - out_channels_mapper: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]], - padding: Union[int, Tuple[int]], - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + D: int, + H: int, + W: int, + out_channels_mapper: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]], + padding: Union[int, Tuple[int]], + dilation: int, + groups: int, + test_or_check: int, ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return x = torch.randn([N, C, D, H, W]) @@ -74,7 +77,7 @@ def test_conv3d( groups=groups, ) is_ew_compatible = ( - dilation == 1 and padding != "same" + dilation == 1 and padding != "same" ) # TODO add support for padding = 'same' with EW if test_or_check == 1: self.run_test( @@ -93,5 +96,5 @@ def test_conv3d( batch_first=True, atol=10e-5, rtol=10e-3, - grad_sample_mode=grad_sample_mode + grad_sample_mode=grad_sample_mode, ) diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index 1f255e89..b1192e2a 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -20,7 +20,10 @@ from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class DPMultiheadAttentionAdapter(nn.Module): @@ -54,20 +57,20 @@ class MultiHeadAttention_test(GradSampleHooks_test): add_bias_kv=st.booleans(), add_zero_attn=st.booleans(), kv_dim=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_multihead_attention( - self, - N: int, - T: int, - D: int, - P: int, - bias: bool, - add_bias_kv: bool, - add_zero_attn: bool, - kv_dim: bool, - test_or_check: int + self, + N: int, + T: int, + D: int, + P: int, + bias: bool, + add_bias_kv: bool, + add_zero_attn: bool, + kv_dim: bool, + test_or_check: int, ): if kv_dim: @@ -93,5 +96,6 @@ def test_multihead_attention( self.run_test(x, attn, batch_first=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, attn, batch_first=False, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + 
x, attn, batch_first=False, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 23402ccf..390cd707 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -21,7 +21,10 @@ from opacus.utils.packed_sequences import _gen_packed_data from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) MODELS = [ DPRNN, @@ -59,23 +62,23 @@ class RNN_test(GradSampleHooks_test): bidirectional=st.booleans(), using_packed_sequences=st.booleans(), packed_sequences_sorted=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_rnn( - self, - model, - N: int, - T: int, - D: int, - H: int, - num_layers: int, - bias: bool, - batch_first: bool, - bidirectional: bool, - using_packed_sequences: bool, - packed_sequences_sorted: bool, - test_or_check: int + self, + model, + N: int, + T: int, + D: int, + H: int, + num_layers: int, + bias: bool, + batch_first: bool, + bidirectional: bool, + using_packed_sequences: bool, + packed_sequences_sorted: bool, + test_or_check: int, ): rnn = model( D, @@ -99,5 +102,6 @@ def test_rnn( self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, rnn, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, rnn, batch_first=batch_first, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index 7e053c1a..21cfcc18 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class Embedding_test(GradSampleHooks_test): @@ -32,20 +35,20 @@ class Embedding_test(GradSampleHooks_test): D=st.integers(10, 17), dim=st.integers(2, 4), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_across_dims( - self, - N: int, - T: int, - Q: int, - R: int, - V: int, - D: int, - dim: int, - batch_first: bool, - test_or_check: int + self, + N: int, + T: int, + Q: int, + R: int, + V: int, + D: int, + dim: int, + batch_first: bool, + test_or_check: int, ): if dim == 1: # TODO: fix when dim is 1 @@ -64,5 +67,6 @@ def test_input_across_dims( self.run_test(x, emb, batch_first=batch_first) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, emb, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, emb, batch_first=batch_first, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index b7f13cb9..c69a71cc 100644 --- 
a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class GroupNorm_test(GradSampleHooks_test): @@ -36,17 +39,17 @@ class GroupNorm_test(GradSampleHooks_test): H=st.integers(5, 10), W=st.integers(4, 8), num_groups=st.sampled_from([1, 4, "C"]), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_3d_input_groups( - self, - N: int, - C: int, - H: int, - W: int, - num_groups: Union[int, str], - test_or_check: int + self, + N: int, + C: int, + H: int, + W: int, + num_groups: Union[int, str], + test_or_check: int, ): if num_groups == "C": @@ -62,5 +65,6 @@ def test_3d_input_groups( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index 7001f1c8..68bdebda 100644 --- a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class InstanceNorm1d_test(GradSampleHooks_test): @@ -27,16 +30,10 @@ class InstanceNorm1d_test(GradSampleHooks_test): N=st.integers(1, 4), C=st.integers(1, 3), W=st.integers(5, 10), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_3d_input( - self, - N: int, - C: int, - W: int, - test_or_check: int - ): + def test_3d_input(self, N: int, C: int, W: int, test_or_check: int): x = torch.randn([N, C, W]) norm = nn.InstanceNorm1d(num_features=C, affine=True, track_running_stats=False) @@ -45,5 +42,6 @@ def test_3d_input( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/instance_norm2d_test.py b/opacus/tests/grad_samples/instance_norm2d_test.py index 6f955f87..be622009 100644 --- a/opacus/tests/grad_samples/instance_norm2d_test.py +++ b/opacus/tests/grad_samples/instance_norm2d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class InstanceNorm2d_test(GradSampleHooks_test): @@ -28,17 +31,10 @@ class 
InstanceNorm2d_test(GradSampleHooks_test): C=st.integers(1, 3), W=st.integers(5, 10), H=st.integers(4, 8), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_4d_input( - self, - N: int, - C: int, - W: int, - H: int, - test_or_check: int - ): + def test_4d_input(self, N: int, C: int, W: int, H: int, test_or_check: int): x = torch.randn([N, C, H, W]) norm = nn.InstanceNorm2d(num_features=C, affine=True, track_running_stats=False) @@ -46,5 +42,6 @@ def test_4d_input( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 68d5298e..14a9d3b6 100644 --- a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class InstanceNorm3d_test(GradSampleHooks_test): @@ -29,24 +32,16 @@ class InstanceNorm3d_test(GradSampleHooks_test): W=st.integers(5, 10), H=st.integers(4, 8), Z=st.integers(1, 4), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_5d_input( - self, - N: int, - C: int, - W: int, - H: int, - Z: int, - test_or_check: int - - ): + def test_5d_input(self, N: int, C: int, W: int, H: int, Z: int, test_or_check: int): x = torch.randn([N, C, Z, H, W]) norm = nn.InstanceNorm3d(num_features=C, affine=True, track_running_stats=False) if test_or_check == 1: self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index ea556dd8..29c3c8f5 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class LayerNorm_test(GradSampleHooks_test): @@ -30,18 +33,18 @@ class LayerNorm_test(GradSampleHooks_test): W=st.integers(5, 10), input_dim=st.integers(2, 4), norm_dim=st.integers(1, 3), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_norm( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - norm_dim: int, - test_or_check: int + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + norm_dim: int, + test_or_check: int, ): if norm_dim >= input_dim: @@ -71,5 
+74,6 @@ def test_input_norm( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index cb3ef89e..bb5e3ba6 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Linear_test(GradSampleHooks_test): @@ -31,19 +34,19 @@ class Linear_test(GradSampleHooks_test): input_dim=st.integers(2, 4), bias=st.booleans(), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_bias( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - bias: bool, - batch_first: bool, - test_or_check: int + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + bias: bool, + batch_first: bool, + test_or_check: int, ): if input_dim == 2: @@ -64,5 +67,9 @@ def test_input_bias( self.run_test(x, linear, batch_first=batch_first) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, linear, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, + linear, + batch_first=batch_first, + grad_sample_mode=grad_sample_mode, + ) diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index 0bbf0e40..0e8d39ee 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -19,7 +19,10 @@ from opacus.layers import SequenceBias from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class SequenceBias_test(GradSampleHooks_test): @@ -28,16 +31,11 @@ class SequenceBias_test(GradSampleHooks_test): T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_batch_second( - self, - N: int, - T: int, - D: int, - batch_first: bool, - test_or_check: int + self, N: int, T: int, D: int, batch_first: bool, test_or_check: int ): seqbias = SequenceBias(D, batch_first) @@ -49,5 +47,9 @@ def test_batch_second( self.run_test(x, seqbias, batch_first, ew_compatible=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=False): - assert check_per_sample_gradients_are_correct(x, seqbias, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, + seqbias, + batch_first=batch_first, + grad_sample_mode=grad_sample_mode, + ) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 
7e00a9ec..9b80d9f7 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -85,10 +85,10 @@ def forward(self, x): def compute_microbatch_grad_sample( - x: Union[torch.Tensor, List[torch.Tensor]], - module: nn.Module, - batch_first=True, - loss_reduction="mean", + x: Union[torch.Tensor, List[torch.Tensor]], + module: nn.Module, + batch_first=True, + loss_reduction="mean", ) -> Dict[str, torch.tensor]: """ Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients @@ -152,11 +152,11 @@ def compute_microbatch_grad_sample( def compute_opacus_grad_sample( - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", ) -> Dict[str, torch.tensor]: """ Runs Opacus to compute per-sample gradients and return them for testing purposes. @@ -190,9 +190,7 @@ def compute_opacus_grad_sample( opacus_grad_samples = { name: p.grad_sample - for name, p in trainable_parameters( - grad_sample_module.wrapped_module._module - ) + for name, p in trainable_parameters(grad_sample_module.wrapped_module._module) } return opacus_grad_samples @@ -209,13 +207,15 @@ def get_grad_sample_modes(use_ew: bool = False): return grad_sample_modes -def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - *, - batch_first=True, - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks") -> bool: +def check_per_sample_gradients_are_correct( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + *, + batch_first=True, + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", +) -> bool: """ A utility to check whether per sample gradients are computed correctly with a particular model. 
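    Example (illustrative sketch, not part of the original patch; it mirrors how the
    grad-sample tests call this utility and assumes a simple ``nn.Linear`` module
    with a random input batch):

        >>> x = torch.randn(16, 8)
        >>> module = nn.Linear(8, 4)
        >>> for grad_sample_mode in get_grad_sample_modes(use_ew=True):
        ...     assert check_per_sample_gradients_are_correct(
        ...         x, module, batch_first=True, grad_sample_mode=grad_sample_mode
        ...     )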
Args: @@ -255,10 +255,12 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence return correct -def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean"): +def compute_microbatch_grad_sample_tensor_or_seq( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", +): if type(x) is PackedSequence: x_unpacked = unpack_packedsequences(x) microbatch_grad_samples = compute_microbatch_grad_sample( @@ -275,21 +277,22 @@ def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSe return microbatch_grad_samples -def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks"): - microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction) +def compute_grad_samples_microbatch_and_opacus( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", +): + microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq( + x, module, batch_first=batch_first, loss_reduction=loss_reduction + ) opacus_grad_samples = compute_opacus_grad_sample( x, module, batch_first=batch_first, loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode + grad_sample_mode=grad_sample_mode, ) if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): @@ -303,26 +306,33 @@ def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequ def _check_per_sample_gradients_are_correct_with_reduction( - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks" + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ) -> bool: - microbatch_grad_samples, opacus_grad_samples = \ - compute_grad_samples_microbatch_and_opacus(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode) + ( + microbatch_grad_samples, + opacus_grad_samples, + ) = compute_grad_samples_microbatch_and_opacus( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) correct = True for name, opacus_grad_sample in opacus_grad_samples.items(): microbatch_grad_sample = microbatch_grad_samples[name] - correct = correct and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, - rtol) and opacus_grad_sample.shape == microbatch_grad_sample.shape + correct = ( + correct + and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, rtol) + and opacus_grad_sample.shape == microbatch_grad_sample.shape + ) return correct @@ -353,7 +363,7 @@ def unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: def _compute_loss_packedsequences( - criterion: nn.L1Loss, x: PackedSequence + criterion: nn.L1Loss, x: PackedSequence ) -> torch.Tensor: r""" This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, From 585be684163c31d1a654e7d51750e37314e99b98 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 17:08:27 +0100 Subject: [PATCH 11/32] Fix flake8 errors --- 
opacus/tests/grad_samples/layer_norm_test.py | 27 ++++++++++++-------- opacus/utils/per_sample_gradients_utils.py | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index 29c3c8f5..b303ec49 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -49,6 +49,22 @@ def test_input_norm( if norm_dim >= input_dim: return + normalized_shape, x_shape = self.get_x_shape_and_norm_shape( + H, N, W, Z, input_dim, norm_dim + ) + + norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) + x = torch.randn(x_shape) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) + + @staticmethod + def get_x_shape_and_norm_shape(H, N, W, Z, input_dim, norm_dim): if norm_dim == 1: normalized_shape = W if input_dim == 2: @@ -67,13 +83,4 @@ def test_input_norm( elif norm_dim == 3: normalized_shape = [Z, H, W] x_shape = [N, Z, H, W] - - norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) - x = torch.randn(x_shape) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + return normalized_shape, x_shape diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 9b80d9f7..724b1542 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -235,7 +235,7 @@ def check_per_sample_gradients_are_correct( reductions = ["sum", "mean"] if grad_sample_mode == "ew": if not batch_first: - raise RuntimeError(f"Batch should be first dimension.") + raise RuntimeError("Batch should be first dimension.") if not check_torch_version_for_ew_sample(): raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") reductions = ["sum"] From 82c8f524761319b3db6d8e981e0e638d9270b561 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Thu, 27 Oct 2022 18:53:14 +0100 Subject: [PATCH 12/32] Implement per sample grads util and refactor code --- opacus/tests/grad_samples/common.py | 324 ++-------------- opacus/tests/grad_samples/conv1d_test.py | 34 +- opacus/tests/grad_samples/conv2d_test.py | 107 +++-- opacus/tests/grad_samples/conv3d_test.py | 56 ++- .../dp_multihead_attention_test.py | 28 +- opacus/tests/grad_samples/dp_rnn_test.py | 36 +- opacus/tests/grad_samples/embedding_test.py | 29 +- opacus/tests/grad_samples/group_norm_test.py | 24 +- .../grad_samples/instance_norm1d_test.py | 19 +- .../grad_samples/instance_norm2d_test.py | 20 +- .../grad_samples/instance_norm3d_test.py | 23 +- opacus/tests/grad_samples/layer_norm_test.py | 24 +- opacus/tests/grad_samples/linear_test.py | 27 +- .../tests/grad_samples/sequence_bias_test.py | 20 +- opacus/utils/per_sample_gradients_utils.py | 367 ++++++++++++++++++ 15 files changed, 692 insertions(+), 446 deletions(-) create mode 100644 opacus/utils/per_sample_gradients_utils.py diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index 5a981ba2..c24d924a 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -15,18 +15,16 @@ 
import io import unittest -from typing import Dict, List, Union +from typing import Union -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from opacus.grad_sample import wrap_model -from opacus.utils.module_utils import trainable_parameters -from opacus.utils.packed_sequences import compute_seq_lengths -from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence +from torch.nn.utils.rnn import PackedSequence from torch.testing import assert_allclose +from opacus.utils.per_sample_gradients_utils import compute_grad_samples_microbatch_and_opacus + def expander(x, factor: int = 2): return x * factor @@ -36,189 +34,20 @@ def shrinker(x, factor: int = 2): return max(1, x // factor) # if avoid returning 0 for x == 1 -class ModelWithLoss(nn.Module): - """ - To test the gradients of a module, we need to have a loss. - This module makes it easy to get a loss from any nn.Module, and automatically generates - a target y vector for it in the forward (of all zeros of the correct size). - This reduces boilerplate while testing. - """ - - supported_reductions = ["mean", "sum"] - - def __init__(self, module: nn.Module, loss_reduction: str = "mean"): - """ - Instantiates this module. - - Args: - module: The nn.Module you want to test. - loss_reduction: What reduction to apply to the loss. Defaults to "mean". - - Raises: - ValueError: If ``loss_reduction`` is not among those supported. - """ - super().__init__() - self.wrapped_module = module - - if loss_reduction not in self.supported_reductions: - raise ValueError( - f"Passed loss_reduction={loss_reduction}. Only {self.supported_reductions} supported." - ) - self.criterion = nn.L1Loss(reduction=loss_reduction) - - def forward(self, x): - x = self.wrapped_module(x) - if type(x) is PackedSequence: - loss = _compute_loss_packedsequences(self.criterion, x) - else: - y = torch.zeros_like(x) - loss = self.criterion(x, y) - return loss - - -def clone_module(module: nn.Module) -> nn.Module: - """ - Handy utility to clone an nn.Module. PyTorch doesn't always support copy.deepcopy(), so it is - just easier to serialize the model to a BytesIO and read it from there. - - Args: - module: The module to clone - - Returns: - The clone of ``module`` - """ - with io.BytesIO() as bytesio: - torch.save(module, bytesio) - bytesio.seek(0) - module_copy = torch.load(bytesio) - return module_copy - - class GradSampleHooks_test(unittest.TestCase): """ Set of common testing utils. It is meant to be subclassed by your test. See other tests as an example of how this is done. """ - def compute_microbatch_grad_sample( - self, - x: Union[torch.Tensor, List[torch.Tensor]], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - ) -> Dict[str, torch.tensor]: - """ - Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients - with batch_size set to 1, and manually accumulating them. This is our reference for testing - as this method is obviously correct, but slow. - - Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. - batch_first: Whether batch size is the first dimension (as opposed to the second). - Defaults to True. 
- - Returns: - Dictionary mapping parameter_name -> per-sample-gradient for that parameter - """ - torch.use_deterministic_algorithms(True) - torch.manual_seed(0) - np.random.seed(0) - - module = ModelWithLoss(clone_module(module), loss_reduction) - - for _, p in trainable_parameters(module): - p.microbatch_grad_sample = [] - - if not batch_first and type(x) is not list: - # This allows us to iterate with x_i - x = x.transpose(0, 1) - - # Invariant: x is [B, T, ...] - - for x_i in x: - # x_i is [T, ...] - x_i = x_i.unsqueeze( - 0 if batch_first else 1 - ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] - module.zero_grad() - loss_i = module(x_i) - loss_i.backward() - for p in module.parameters(): - p.microbatch_grad_sample.append(p.grad.detach().clone()) - - for _, p in trainable_parameters(module): - if batch_first: - p.microbatch_grad_sample = torch.stack( - p.microbatch_grad_sample, dim=0 # [B, T, ...] - ) - else: - p.microbatch_grad_sample = torch.stack( - p.microbatch_grad_sample, dim=1 # [T, B, ...] - ).transpose( - 0, 1 - ) # Opacus's semantics is that grad_samples are ALWAYS batch_first: [B, T, ...] - - microbatch_grad_samples = { - name: p.microbatch_grad_sample - for name, p in trainable_parameters(module.wrapped_module) - } - return microbatch_grad_samples - - def compute_opacus_grad_sample( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", - ) -> Dict[str, torch.tensor]: - """ - Runs Opacus to compute per-sample gradients and return them for testing purposes. - - Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. - batch_first: Whether batch size is the first dimension (as opposed to the second). - Defaults to True. - loss_reduction: What reduction to apply to the loss. Defaults to "mean". 
- - Returns: - Dictionary mapping parameter_name -> per-sample-gradient for that parameter - """ - torch.use_deterministic_algorithms(True) - torch.manual_seed(0) - np.random.seed(0) - - gs_module = wrap_model( - model=clone_module(module), - grad_sample_mode=grad_sample_mode, - batch_first=batch_first, - loss_reduction=loss_reduction, - ) - grad_sample_module = ModelWithLoss(gs_module, loss_reduction) - - grad_sample_module.zero_grad() - loss = grad_sample_module(x) - loss.backward() - - opacus_grad_samples = { - name: p.grad_sample - for name, p in trainable_parameters( - grad_sample_module.wrapped_module._module - ) - } - - return opacus_grad_samples - def run_test( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - atol=10e-6, - rtol=10e-5, - ew_compatible=True, + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + atol=10e-6, + rtol=10e-5, + ew_compatible=True, ): grad_sample_modes = ["hooks", "functorch"] try: @@ -231,9 +60,8 @@ def run_test( for grad_sample_mode in grad_sample_modes: for loss_reduction in ["sum", "mean"]: - with self.subTest( - grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction + grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction ): self.run_test_with_reduction( x, @@ -256,45 +84,21 @@ def run_test( ) def run_test_with_reduction( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ): - opacus_grad_samples = self.compute_opacus_grad_sample( - x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode, - ) - - if type(x) is PackedSequence: - x_unpacked = _unpack_packedsequences(x) - microbatch_grad_samples = self.compute_microbatch_grad_sample( - x_unpacked, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - ) - elif x.numel() > 0: - microbatch_grad_samples = self.compute_microbatch_grad_sample( - x, module, batch_first=batch_first, loss_reduction=loss_reduction - ) - else: - # We've checked opacus can handle 0-sized batch. Microbatch doesn't make sense - return - - if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): - raise ValueError( - "Keys not matching! 
" - f"Keys only in microbatch: {microbatch_grad_samples.keys() - opacus_grad_samples.keys()}; " - f"Keys only in Opacus: {opacus_grad_samples.keys() - microbatch_grad_samples.keys()}" - ) + microbatch_grad_samples, opacus_grad_samples = \ + compute_grad_samples_microbatch_and_opacus(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode) self.check_shapes(microbatch_grad_samples, opacus_grad_samples, loss_reduction) self.check_values( @@ -302,10 +106,10 @@ def run_test_with_reduction( ) def check_shapes( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -333,12 +137,12 @@ def check_shapes( ) def check_values( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, - atol, - rtol, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, + atol, + rtol, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -364,59 +168,3 @@ def check_values( f"A total of {len(failed)} values do not match " f"for loss_reduction={loss_reduction}: \n\t{failed_str}" ) - - -def _unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: - r""" - Produces a list of tensors from X (PackedSequence) such that this list was used to create X with batch_first=True - - Args: - X: A PackedSequence from which the output list of tensors will be produced. - - Returns: - unpacked_data: The list of tensors produced from X. - """ - - X_padded = pad_packed_sequence(X) - X_padded = X_padded[0].permute((1, 0, 2)) - - if X.sorted_indices is not None: - X_padded = X_padded[X.sorted_indices] - - seq_lens = compute_seq_lengths(X.batch_sizes) - unpacked_data = [0] * len(seq_lens) - for idx, length in enumerate(seq_lens): - unpacked_data[idx] = X_padded[idx][:length, :] - - return unpacked_data - - -def _compute_loss_packedsequences( - criterion: nn.L1Loss, x: PackedSequence -) -> torch.Tensor: - r""" - This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, - it computes the same way as with non-packed data. For 'mean' reduced L1 loss, it transforms x (PackedSequence) - into a list of tensors such that this list of tensors was used to create this PackedSequence in the first - place using batch_first=True and then takes the mean of the loss values produced from applying criterion on - each sequence sample. - - Args: - criterion: An L1 loss function with reduction either set to 'sum' or 'mean'. - x: Data in the form of a PackedSequence. - - Returns: - A loss variable, reduced either using summation or averaging from L1 errors. 
- """ - - if criterion.reduction == "sum": - y = torch.zeros_like(x[0]) - return criterion(x[0], y) - elif criterion.reduction == "mean": - x = _unpack_packedsequences(x) - loss_sum = 0 - for x_i in x: - y_i = torch.zeros_like(x_i) - loss_sum += criterion(x_i, y_i) - loss_mean = loss_sum / len(x) - return loss_mean diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 2576f159..08598902 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class Conv1d_test(GradSampleHooks_test): @@ -34,26 +35,28 @@ class Conv1d_test(GradSampleHooks_test): padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), groups=st.integers(1, 12), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_conv1d( - self, - N: int, - C: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, + self, + N: int, + C: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -67,6 +70,9 @@ def test_conv1d( dilation=dilation, groups=groups, ) - self.run_test( - x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=N > 0 - ) + if test_or_check == 1: + self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=N > 0) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew= N>0): + assert check_per_sample_gradients_are_correct(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 6d9a5b33..558a71d6 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -25,6 +25,7 @@ from torch.testing import assert_allclose from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Conv2d_test(GradSampleHooks_test): @@ -39,26 +40,28 @@ class Conv2d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, "same", "valid"]), dilation=st.integers(1, 3), groups=st.integers(1, 16), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv2d( - self, - N: int, - C: int, - H: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, + self, + N: int, + C: int, + H: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ 
-77,22 +80,7 @@ def test_conv2d( ) # TODO add support for padding = 'same' with EW # Test regular GSM - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-4, - ew_compatible=is_ew_compatible, - ) - - if padding != "same" and N > 0: - # Test 'convolution as a backward' GSM - # 'convolution as a backward' doesn't support padding=same - conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] - GradSampleModule.GRAD_SAMPLERS[ - nn.Conv2d - ] = convolution2d_backward_as_a_convolution + if test_or_check == 1: self.run_test( x, conv, @@ -101,6 +89,43 @@ def test_conv2d( rtol=10e-4, ew_compatible=is_ew_compatible, ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode + ) + + if padding != "same" and N > 0: + # Test 'convolution as a backward' GSM + # 'convolution as a backward' doesn't support padding=same + conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] + GradSampleModule.GRAD_SAMPLERS[ + nn.Conv2d + ] = convolution2d_backward_as_a_convolution + if test_or_check == 1: + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + ew_compatible=is_ew_compatible, + ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode, + ) GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] = conv2d_gsm @given( @@ -119,19 +144,19 @@ def test_conv2d( ) @settings(deadline=30000) def test_unfold2d( - self, - B: int, - C: int, - H: int, - W: int, - k_h: int, - k_w: int, - pad_h: int, - pad_w: int, - stride_h: int, - stride_w: int, - dilation_h: int, - dilation_w: int, + self, + B: int, + C: int, + H: int, + W: int, + k_h: int, + k_w: int, + pad_h: int, + pad_w: int, + stride_h: int, + stride_w: int, + dilation_h: int, + dilation_w: int, ): X = torch.randn(B, C, H, W) X_unfold_torch = torch.nn.functional.unfold( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index e50909e2..5881beb3 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Conv3d_test(GradSampleHooks_test): @@ -36,28 +37,30 @@ class Conv3d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, (1, 2, 3), "same", "valid"]), dilation=st.sampled_from([1, (1, 2, 2)]), groups=st.integers(1, 16), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv3d( - self, - N: int, - C: int, - D: int, - H: int, - W: int, - out_channels_mapper: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]], - padding: Union[int, Tuple[int]], - dilation: int, - groups: int, + self, + N: int, + C: int, + D: int, + H: int, + W: int, + out_channels_mapper: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]], + padding: Union[int, Tuple[int]], + dilation: int, + groups: int, + test_or_check: int ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 
0 ): # since in_channels and out_channels must be divisible by groups return x = torch.randn([N, C, D, H, W]) @@ -73,11 +76,22 @@ def test_conv3d( is_ew_compatible = ( dilation == 1 and padding != "same" and N > 0 ) # TODO add support for padding = 'same' with EW - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - ew_compatible=is_ew_compatible, - ) + if test_or_check == 1: + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + ew_compatible=is_ew_compatible, + ) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index 057f2391..1f255e89 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -20,6 +20,7 @@ from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class DPMultiheadAttentionAdapter(nn.Module): @@ -53,18 +54,20 @@ class MultiHeadAttention_test(GradSampleHooks_test): add_bias_kv=st.booleans(), add_zero_attn=st.booleans(), kv_dim=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_multihead_attention( - self, - N: int, - T: int, - D: int, - P: int, - bias: bool, - add_bias_kv: bool, - add_zero_attn: bool, - kv_dim: bool, + self, + N: int, + T: int, + D: int, + P: int, + bias: bool, + add_bias_kv: bool, + add_zero_attn: bool, + kv_dim: bool, + test_or_check: int ): if kv_dim: @@ -86,4 +89,9 @@ def test_multihead_attention( v = torch.randn([T, N, D]) x = torch.stack((q, k, v), dim=-1) - self.run_test(x, attn, batch_first=False) + if test_or_check == 1: + self.run_test(x, attn, batch_first=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, attn, batch_first=False, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 39f29ad6..23402ccf 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -21,7 +21,7 @@ from opacus.utils.packed_sequences import _gen_packed_data from .common import GradSampleHooks_test - +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes MODELS = [ DPRNN, @@ -59,21 +59,23 @@ class RNN_test(GradSampleHooks_test): bidirectional=st.booleans(), using_packed_sequences=st.booleans(), packed_sequences_sorted=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_rnn( - self, - model, - N: int, - T: int, - D: int, - H: int, - num_layers: int, - bias: bool, - batch_first: bool, - bidirectional: bool, - using_packed_sequences: bool, - packed_sequences_sorted: bool, + self, + model, + N: int, + T: int, + D: int, + H: int, + num_layers: int, + bias: bool, + batch_first: bool, + bidirectional: bool, + using_packed_sequences: bool, + packed_sequences_sorted: bool, + test_or_check: int ): rnn = model( D, @@ -92,4 +94,10 @@ def test_rnn( x = torch.randn([N, T, D]) else: x = torch.randn([T, N, D]) - self.run_test(x, rnn, 
batch_first=batch_first, ew_compatible=False) + + if test_or_check == 1: + self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, rnn, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index e0142d36..4e6bab79 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class Embedding_test(GradSampleHooks_test): @@ -31,18 +32,20 @@ class Embedding_test(GradSampleHooks_test): D=st.integers(10, 17), dim=st.integers(2, 4), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_across_dims( - self, - N: int, - T: int, - Q: int, - R: int, - V: int, - D: int, - dim: int, - batch_first: bool, + self, + N: int, + T: int, + Q: int, + R: int, + V: int, + D: int, + dim: int, + batch_first: bool, + test_or_check: int ): if dim == 1: # TODO: fix when dim is 1 @@ -56,4 +59,10 @@ def test_input_across_dims( emb = nn.Embedding(V, D) x = torch.randint(low=0, high=V - 1, size=size) - self.run_test(x, emb, batch_first=batch_first, ew_compatible=N > 0) + ew_compatible = N > 0 + if test_or_check == 1: + self.run_test(x, emb, batch_first=batch_first, ew_compatible=ew_compatible) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): + assert check_per_sample_gradients_are_correct(x, emb, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index e3836b93..39f4b100 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -21,6 +21,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class GroupNorm_test(GradSampleHooks_test): @@ -35,15 +36,17 @@ class GroupNorm_test(GradSampleHooks_test): H=st.integers(5, 10), W=st.integers(4, 8), num_groups=st.sampled_from([1, 4, "C"]), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_3d_input_groups( - self, - N: int, - C: int, - H: int, - W: int, - num_groups: Union[int, str], + self, + N: int, + C: int, + H: int, + W: int, + num_groups: Union[int, str], + test_or_check: int ): if num_groups == "C": @@ -53,5 +56,12 @@ def test_3d_input_groups( return x = torch.randn([N, C, H, W]) + ew_compatible=N > 0 norm = nn.GroupNorm(num_groups=num_groups, num_channels=C, affine=True) - self.run_test(x, norm, batch_first=True, ew_compatible=N > 0) + self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True, ew_compatible=ew_compatible) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index 151e3b1d..7001f1c8 100644 --- 
a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class InstanceNorm1d_test(GradSampleHooks_test): @@ -26,15 +27,23 @@ class InstanceNorm1d_test(GradSampleHooks_test): N=st.integers(1, 4), C=st.integers(1, 3), W=st.integers(5, 10), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_3d_input( - self, - N: int, - C: int, - W: int, + self, + N: int, + C: int, + W: int, + test_or_check: int ): x = torch.randn([N, C, W]) norm = nn.InstanceNorm1d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm2d_test.py b/opacus/tests/grad_samples/instance_norm2d_test.py index cc7ba1f1..6f955f87 100644 --- a/opacus/tests/grad_samples/instance_norm2d_test.py +++ b/opacus/tests/grad_samples/instance_norm2d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class InstanceNorm2d_test(GradSampleHooks_test): @@ -27,16 +28,23 @@ class InstanceNorm2d_test(GradSampleHooks_test): C=st.integers(1, 3), W=st.integers(5, 10), H=st.integers(4, 8), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_4d_input( - self, - N: int, - C: int, - W: int, - H: int, + self, + N: int, + C: int, + W: int, + H: int, + test_or_check: int ): x = torch.randn([N, C, H, W]) norm = nn.InstanceNorm2d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 1b3b3de3..68d5298e 100644 --- a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class InstanceNorm3d_test(GradSampleHooks_test): @@ -28,16 +29,24 @@ class InstanceNorm3d_test(GradSampleHooks_test): W=st.integers(5, 10), H=st.integers(4, 8), Z=st.integers(1, 4), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_5d_input( - self, - N: int, - C: int, - W: int, - H: int, - Z: int, + self, + N: int, + C: int, + W: int, + H: int, + Z: int, + test_or_check: int + ): x = torch.randn([N, C, Z, H, W]) norm = nn.InstanceNorm3d(num_features=C, affine=True, track_running_stats=False) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in 
get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index 3e69eaa2..ea556dd8 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class LayerNorm_test(GradSampleHooks_test): @@ -29,16 +30,18 @@ class LayerNorm_test(GradSampleHooks_test): W=st.integers(5, 10), input_dim=st.integers(2, 4), norm_dim=st.integers(1, 3), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_norm( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - norm_dim: int, + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + norm_dim: int, + test_or_check: int ): if norm_dim >= input_dim: @@ -64,4 +67,9 @@ def test_input_norm( norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) x = torch.randn(x_shape) - self.run_test(x, norm, batch_first=True) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index e856e9d3..516c6c05 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -19,6 +19,7 @@ from hypothesis import given, settings from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct class Linear_test(GradSampleHooks_test): @@ -30,17 +31,19 @@ class Linear_test(GradSampleHooks_test): input_dim=st.integers(2, 4), bias=st.booleans(), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_bias( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - bias: bool, - batch_first: bool, + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + bias: bool, + batch_first: bool, + test_or_check: int ): if input_dim == 2: @@ -57,4 +60,10 @@ def test_input_bias( x = torch.randn(x_shape) if not batch_first: x = x.transpose(0, 1) - self.run_test(x, linear, batch_first=batch_first, ew_compatible=N > 0) + ew_compatible = N > 0 + if test_or_check == 1: + self.run_test(x, linear, batch_first=batch_first, ew_compatible=ew_compatible) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): + assert check_per_sample_gradients_are_correct(x, linear, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index ec36d74b..2a3a1617 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -19,6 +19,7 @@ from opacus.layers import SequenceBias from .common import GradSampleHooks_test +from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes class SequenceBias_test(GradSampleHooks_test): @@ -27,14 +28,16 @@ class 
SequenceBias_test(GradSampleHooks_test): T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), + test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_batch_second( - self, - N: int, - T: int, - D: int, - batch_first: bool, + self, + N: int, + T: int, + D: int, + batch_first: bool, + test_or_check: int ): seqbias = SequenceBias(D, batch_first) @@ -42,4 +45,9 @@ def test_batch_second( x = torch.randn([N, T, D]) else: x = torch.randn([T, N, D]) - self.run_test(x, seqbias, batch_first, ew_compatible=False) + if test_or_check == 1: + self.run_test(x, seqbias, batch_first, ew_compatible=False) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=False): + assert check_per_sample_gradients_are_correct(x, seqbias, batch_first=batch_first, + grad_sample_mode=grad_sample_mode) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py new file mode 100644 index 00000000..74ce32ed --- /dev/null +++ b/opacus/utils/per_sample_gradients_utils.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +from typing import Union, Dict, List + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence + +from opacus.grad_sample import wrap_model +from opacus.utils.module_utils import trainable_parameters +from opacus.utils.packed_sequences import compute_seq_lengths + + +def clone_module(module: nn.Module) -> nn.Module: + """ + Handy utility to clone an nn.Module. PyTorch doesn't always support copy.deepcopy(), so it is + just easier to serialize the model to a BytesIO and read it from there. + + Args: + module: The module to clone + + Returns: + The clone of ``module`` + """ + with io.BytesIO() as bytesio: + torch.save(module, bytesio) + bytesio.seek(0) + module_copy = torch.load(bytesio) + return module_copy + + +class ModelWithLoss(nn.Module): + """ + To test the gradients of a module, we need to have a loss. + This module makes it easy to get a loss from any nn.Module, and automatically generates + a target y vector for it in the forward (of all zeros of the correct size). + This reduces boilerplate while testing. + """ + + supported_reductions = ["mean", "sum"] + + def __init__(self, module: nn.Module, loss_reduction: str = "mean"): + """ + Instantiates this module. + + Args: + module: The nn.Module you want to test. + loss_reduction: What reduction to apply to the loss. Defaults to "mean". + + Raises: + ValueError: If ``loss_reduction`` is not among those supported. + """ + super().__init__() + self.wrapped_module = module + + if loss_reduction not in self.supported_reductions: + raise ValueError( + f"Passed loss_reduction={loss_reduction}. Only {self.supported_reductions} supported." 
+ ) + self.criterion = nn.L1Loss(reduction=loss_reduction) + + def forward(self, x): + x = self.wrapped_module(x) + if type(x) is PackedSequence: + loss = _compute_loss_packedsequences(self.criterion, x) + else: + y = torch.zeros_like(x) + loss = self.criterion(x, y) + return loss + + +def compute_microbatch_grad_sample( + x: Union[torch.Tensor, List[torch.Tensor]], + module: nn.Module, + batch_first=True, + loss_reduction="mean", +) -> Dict[str, torch.tensor]: + """ + Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients + with batch_size set to 1, and manually accumulating them. This is our reference for testing + as this method is obviously correct, but slow. + + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + + Returns: + Dictionary mapping parameter_name -> per-sample-gradient for that parameter + """ + torch.use_deterministic_algorithms(True) + torch.manual_seed(0) + np.random.seed(0) + + module = ModelWithLoss(clone_module(module), loss_reduction) + + for _, p in trainable_parameters(module): + p.microbatch_grad_sample = [] + + if not batch_first and type(x) is not list: + # This allows us to iterate with x_i + x = x.transpose(0, 1) + + # Invariant: x is [B, T, ...] + + for x_i in x: + # x_i is [T, ...] + x_i = x_i.unsqueeze( + 0 if batch_first else 1 + ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] + module.zero_grad() + loss_i = module(x_i) + loss_i.backward() + for p in module.parameters(): + p.microbatch_grad_sample.append(p.grad.detach().clone()) + + for _, p in trainable_parameters(module): + if batch_first: + p.microbatch_grad_sample = torch.stack( + p.microbatch_grad_sample, dim=0 # [B, T, ...] + ) + else: + p.microbatch_grad_sample = torch.stack( + p.microbatch_grad_sample, dim=1 # [T, B, ...] + ).transpose( + 0, 1 + ) # Opacus's semantics is that grad_samples are ALWAYS batch_first: [B, T, ...] + + microbatch_grad_samples = { + name: p.microbatch_grad_sample + for name, p in trainable_parameters(module.wrapped_module) + } + return microbatch_grad_samples + + +def compute_opacus_grad_sample( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", +) -> Dict[str, torch.tensor]: + """ + Runs Opacus to compute per-sample gradients and return them for testing purposes. + + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + loss_reduction: What reduction to apply to the loss. Defaults to "mean". 
+ + Returns: + Dictionary mapping parameter_name -> per-sample-gradient for that parameter + """ + torch.use_deterministic_algorithms(True) + torch.manual_seed(0) + np.random.seed(0) + + gs_module = wrap_model( + model=clone_module(module), + grad_sample_mode=grad_sample_mode, + batch_first=batch_first, + loss_reduction=loss_reduction, + ) + grad_sample_module = ModelWithLoss(gs_module, loss_reduction) + + grad_sample_module.zero_grad() + loss = grad_sample_module(x) + loss.backward() + + opacus_grad_samples = { + name: p.grad_sample + for name, p in trainable_parameters( + grad_sample_module.wrapped_module._module + ) + } + + return opacus_grad_samples + + +def check_torch_version_for_ew_sample() -> bool: + return torch.__version__ >= (1, 13) + + +def get_grad_sample_modes(use_ew: bool = False): + grad_sample_modes = ["hooks", "functorch"] + if use_ew and check_torch_version_for_ew_sample(): + grad_sample_modes.append("ew") + return grad_sample_modes + + +def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + *, + batch_first=True, + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks") -> bool: + if grad_sample_mode == "functorch": + import functorch # noqa + + reductions = ["sum", "mean"] + if grad_sample_mode == "ew": + if not batch_first: + raise RuntimeError(f"Batch should be first dimension.") + if not check_torch_version_for_ew_sample(): + raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") + reductions = ["sum"] + + correct = True + for loss_reduction in reductions: + correct = correct and check_per_sample_gradients_are_correct_with_reduction( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + atol=atol, + rtol=rtol, + grad_sample_mode=grad_sample_mode, + ) + + return correct + + +def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean"): + if type(x) is PackedSequence: + x_unpacked = unpack_packedsequences(x) + microbatch_grad_samples = compute_microbatch_grad_sample( + x_unpacked, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + ) + else: + microbatch_grad_samples = compute_microbatch_grad_sample( + x, module, batch_first=batch_first, loss_reduction=loss_reduction + ) + + return microbatch_grad_samples + + +def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks"): + microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction) + opacus_grad_samples = compute_opacus_grad_sample( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode + ) + + if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): + raise ValueError( + "Keys not matching! 
" + f"Keys only in microbatch: {microbatch_grad_samples.keys() - opacus_grad_samples.keys()}; " + f"Keys only in Opacus: {opacus_grad_samples.keys() - microbatch_grad_samples.keys()}" + ) + + return microbatch_grad_samples, opacus_grad_samples + + +def check_per_sample_gradients_are_correct_with_reduction( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks" +) -> bool: + microbatch_grad_samples, opacus_grad_samples = \ + compute_grad_samples_microbatch_and_opacus(x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode) + + correct = True + for name, opacus_grad_sample in opacus_grad_samples.items(): + microbatch_grad_sample = microbatch_grad_samples[name] + correct = correct and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, + rtol) and opacus_grad_sample.shape == microbatch_grad_sample.shape + + return correct + + +def unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: + r""" + Produces a list of tensors from X (PackedSequence) such that this list was used to create X with batch_first=True + + Args: + X: A PackedSequence from which the output list of tensors will be produced. + + Returns: + unpacked_data: The list of tensors produced from X. + """ + + X_padded = pad_packed_sequence(X) + X_padded = X_padded[0].permute((1, 0, 2)) + + if X.sorted_indices is not None: + X_padded = X_padded[X.sorted_indices] + + seq_lens = compute_seq_lengths(X.batch_sizes) + unpacked_data = [0] * len(seq_lens) + for idx, length in enumerate(seq_lens): + unpacked_data[idx] = X_padded[idx][:length, :] + + return unpacked_data + + +def _compute_loss_packedsequences( + criterion: nn.L1Loss, x: PackedSequence +) -> torch.Tensor: + r""" + This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, + it computes the same way as with non-packed data. For 'mean' reduced L1 loss, it transforms x (PackedSequence) + into a list of tensors such that this list of tensors was used to create this PackedSequence in the first + place using batch_first=True and then takes the mean of the loss values produced from applying criterion on + each sequence sample. + + Args: + criterion: An L1 loss function with reduction either set to 'sum' or 'mean'. + x: Data in the form of a PackedSequence. + + Returns: + A loss variable, reduced either using summation or averaging from L1 errors. + """ + + if criterion.reduction == "sum": + y = torch.zeros_like(x[0]) + return criterion(x[0], y) + elif criterion.reduction == "mean": + x = unpack_packedsequences(x) + loss_sum = 0 + for x_i in x: + y_i = torch.zeros_like(x_i) + loss_sum += criterion(x_i, y_i) + loss_mean = loss_sum / len(x) + return loss_mean From 1d957fa349c70c036155f2c9075d87d1cfcd58fe Mon Sep 17 00:00:00 2001 From: Goutham Rajendran Date: Thu, 27 Oct 2022 08:18:39 -0700 Subject: [PATCH 13/32] Fixed issue with missing argument in MNIST example (#520) Summary: Pull Request resolved: https://github.com/pytorch/opacus/pull/520 sr stands for sampling rate, which is now legacy code. Now, it's just sample_rate = 1 / len(data_loader). This has been fixed in the example, by setting batch size to be 60000 * 0.004 = 240 (thanks to https://github.com/ffuuugor for the clarification). 
On another note, when running with DP, the following error is thrown ``` AttributeError: Can't pickle local object 'wrap_collate_with_empty..collate' ``` For now, a temporary fix (based on https://github.com/IBM/Project_CodeNet/issues/21#issuecomment-864619383) is to make num_workers=0 in the dataset loaders. This commit does that. Reviewed By: ffuuugor Differential Revision: D40253037 fbshipit-source-id: 99984f8963a4efea6829d109bb81acff0e587c93 --- examples/mnist.py | 4 ++-- examples/mnist_README.md | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/mnist.py b/examples/mnist.py index f5d50f6c..a97c34c7 100644 --- a/examples/mnist.py +++ b/examples/mnist.py @@ -224,7 +224,7 @@ def main(): ), ), batch_size=args.batch_size, - num_workers=1, + num_workers=0, pin_memory=True, ) test_loader = torch.utils.data.DataLoader( @@ -240,7 +240,7 @@ def main(): ), batch_size=args.test_batch_size, shuffle=True, - num_workers=1, + num_workers=0, pin_memory=True, ) run_results = [] diff --git a/examples/mnist_README.md b/examples/mnist_README.md index d822a397..8cbcc3a0 100644 --- a/examples/mnist_README.md +++ b/examples/mnist_README.md @@ -1,7 +1,7 @@ # First run To run a basic training script without differential privacy: ```shell -python mnist.py --device=cpu --disable-dp --n=20 --lr=.1 -sr=0.004 +python mnist.py --device=cpu --disable-dp --n=20 --lr=.1 -b=240 ``` The first time the script runs, it attempts to download the MNIST dataset from http://yann.lecun.com and place it in `../mnist/MNIST/raw`. If you prefer a different location or your execution environment does not have access to the outside world, download and unpack the dataset yourself and pass the location as `--data-root=custom_dir_name`. The script will expect to find under `custom_dir_name/MNIST/processed` two files: `test.pt` (7.9 MB) and `training.pt` (47.5 MB). 
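For example, a first run against a pre-downloaded copy might look like this (an illustrative sketch: the flags mirror the command above, and `custom_dir_name` is the placeholder used in this README):
```shell
# Expected layout of the pre-downloaded data
ls custom_dir_name/MNIST/processed
# test.pt  training.pt

# Same non-private run as above, pointed at the local copy
python mnist.py --device=cpu --disable-dp --n=20 --lr=.1 -b=240 --data-root=custom_dir_name
```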
@@ -21,7 +21,7 @@ Test set: Average loss: 0.0000, Accuracy: 9893/10000 (98.93%) To train a differentially private model, run the following command: ```shell -python mnist.py --device=cpu -n=15 --lr=.25 --sigma=1.3 -c=1.5 -sr=0.004 +python mnist.py --device=cpu -n=15 --lr=.25 --sigma=1.3 -c=1.5 -b=240 ``` If the run is successful, expect to see ``` @@ -39,24 +39,24 @@ Test set: Average loss: 0.0004, Accuracy: 9486/10000 (94.86%) **Baseline: no differential privacy** -Command: `--disable-dp --n=20 --lr=.1 -sr=0.004` +Command: `--disable-dp --n=20 --lr=.1 -b=240` Result: accuracy averaged over 10 runs 98.94% ± 0.32% **(6.86, 10-5)-DP** -Command: `-n=45 --lr=.25 --sigma=.7 -c=1.5 -sr=0.004` +Command: `-n=45 --lr=.25 --sigma=.7 -c=1.5 -b=240` Result: accuracy averaged over 10 runs 97.09% ± 0.17% **(2.91, 10-5)-DP** -Command: `-n 60 --lr=.15 --sigma=1.1 -c=1.0 -sr=0.004` +Command: `-n 60 --lr=.15 --sigma=1.1 -c=1.0 -b=240` Result: accuracy averaged over 10 runs 96.78% ± 0.21% **(1.16, 10-5)-DP** -Command: `-n=15 --lr=.25 --sigma=1.3 -c=1.5 -sr=0.004` +Command: `-n=15 --lr=.25 --sigma=1.3 -c=1.5 -b=240` Result: accuracy averaged over 10 runs 94.63% ± 0.34% From 4e3a9797b44273ff691f6b1da07bd7453606c63f Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 12:22:15 +0100 Subject: [PATCH 14/32] Add docs and refactor --- opacus/utils/per_sample_gradients_utils.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 74ce32ed..7e00a9ec 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -100,6 +100,8 @@ def compute_microbatch_grad_sample( module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. + loss_reduction: Indicates if the loss reduction (for aggregating the gradients) + is a sum or a mean operation. Can take values "sum" or "mean". Returns: Dictionary mapping parameter_name -> per-sample-gradient for that parameter @@ -165,6 +167,7 @@ def compute_opacus_grad_sample( batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. loss_reduction: What reduction to apply to the loss. Defaults to "mean". + grad_sample_mode: What sampling method to use to get gradients. Returns: Dictionary mapping parameter_name -> per-sample-gradient for that parameter @@ -213,6 +216,19 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence atol=10e-6, rtol=10e-5, grad_sample_mode="hooks") -> bool: + """ + A utility to check whether per sample gradients are computed correctly with a particular model. + Args: + x: The tensor in input to the ``module`` + module: The ``ModelWithLoss`` that wraps the nn.Module you want to check. + batch_first: Whether batch size is the first dimension (as opposed to the second). + Defaults to True. + atol: The relative tolerance parameter (numpy). + rtol: The absolute tolerance parameter (numpy). + grad_sample_mode: What sampling method to use to get gradients. + + Returns: True if per sample gradients were computed correctly. False otherwise. 
+ """ if grad_sample_mode == "functorch": import functorch # noqa @@ -226,7 +242,7 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence correct = True for loss_reduction in reductions: - correct = correct and check_per_sample_gradients_are_correct_with_reduction( + correct = correct and _check_per_sample_gradients_are_correct_with_reduction( x, module, batch_first=batch_first, @@ -286,7 +302,7 @@ def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequ return microbatch_grad_samples, opacus_grad_samples -def check_per_sample_gradients_are_correct_with_reduction( +def _check_per_sample_gradients_are_correct_with_reduction( x: Union[torch.Tensor, PackedSequence], module: nn.Module, batch_first=True, From 3d0a5dba045f5a4f26710946dd478f9ebfb60d22 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 12:51:18 +0100 Subject: [PATCH 15/32] Apply code style fixes --- opacus/tests/grad_samples/common.py | 72 ++++++------ opacus/tests/grad_samples/conv1d_test.py | 36 +++--- opacus/tests/grad_samples/conv2d_test.py | 33 +++--- opacus/tests/grad_samples/conv3d_test.py | 5 +- .../dp_multihead_attention_test.py | 32 +++--- opacus/tests/grad_samples/dp_rnn_test.py | 38 +++--- opacus/tests/grad_samples/embedding_test.py | 5 +- opacus/tests/grad_samples/group_norm_test.py | 5 +- .../grad_samples/instance_norm1d_test.py | 20 ++-- .../grad_samples/instance_norm2d_test.py | 21 ++-- .../grad_samples/instance_norm3d_test.py | 23 ++-- opacus/tests/grad_samples/layer_norm_test.py | 28 +++-- opacus/tests/grad_samples/linear_test.py | 5 +- .../tests/grad_samples/sequence_bias_test.py | 22 ++-- opacus/utils/per_sample_gradients_utils.py | 108 ++++++++++-------- 15 files changed, 246 insertions(+), 207 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index c24d924a..ec0a1c53 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -23,7 +23,9 @@ from torch.nn.utils.rnn import PackedSequence from torch.testing import assert_allclose -from opacus.utils.per_sample_gradients_utils import compute_grad_samples_microbatch_and_opacus +from opacus.utils.per_sample_gradients_utils import ( + compute_grad_samples_microbatch_and_opacus, +) def expander(x, factor: int = 2): @@ -41,13 +43,13 @@ class GradSampleHooks_test(unittest.TestCase): """ def run_test( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - atol=10e-6, - rtol=10e-5, - ew_compatible=True, + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + atol=10e-6, + rtol=10e-5, + ew_compatible=True, ): grad_sample_modes = ["hooks", "functorch"] try: @@ -61,7 +63,7 @@ def run_test( for grad_sample_mode in grad_sample_modes: for loss_reduction in ["sum", "mean"]: with self.subTest( - grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction + grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction ): self.run_test_with_reduction( x, @@ -84,21 +86,25 @@ def run_test( ) def run_test_with_reduction( - self, - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + self, + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ): - microbatch_grad_samples, opacus_grad_samples = \ - 
compute_grad_samples_microbatch_and_opacus(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode) + ( + microbatch_grad_samples, + opacus_grad_samples, + ) = compute_grad_samples_microbatch_and_opacus( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) self.check_shapes(microbatch_grad_samples, opacus_grad_samples, loss_reduction) self.check_values( @@ -106,10 +112,10 @@ def run_test_with_reduction( ) def check_shapes( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): @@ -137,12 +143,12 @@ def check_shapes( ) def check_values( - self, - microbatch_grad_samples, - opacus_grad_samples, - loss_reduction, - atol, - rtol, + self, + microbatch_grad_samples, + opacus_grad_samples, + loss_reduction, + atol, + rtol, ) -> None: failed = [] for name, opacus_grad_sample in opacus_grad_samples.items(): diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 08598902..795539dc 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class Conv1d_test(GradSampleHooks_test): @@ -35,28 +38,28 @@ class Conv1d_test(GradSampleHooks_test): padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), groups=st.integers(1, 12), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_conv1d( - self, - N: int, - C: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + test_or_check: int, ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -70,9 +73,10 @@ def test_conv1d( dilation=dilation, groups=groups, ) + ew_compatible=N > 0 if test_or_check == 1: - self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=N > 0) + self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=ew_compatible) if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew= N>0): + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): assert check_per_sample_gradients_are_correct(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, grad_sample_mode=grad_sample_mode) diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 558a71d6..185c9246 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -25,7 +25,10 @@ from torch.testing import assert_allclose from .common import 
GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Conv2d_test(GradSampleHooks_test): @@ -61,7 +64,7 @@ def test_conv2d( return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return @@ -144,19 +147,19 @@ def test_conv2d( ) @settings(deadline=30000) def test_unfold2d( - self, - B: int, - C: int, - H: int, - W: int, - k_h: int, - k_w: int, - pad_h: int, - pad_w: int, - stride_h: int, - stride_w: int, - dilation_h: int, - dilation_w: int, + self, + B: int, + C: int, + H: int, + W: int, + k_h: int, + k_w: int, + pad_h: int, + pad_w: int, + stride_h: int, + stride_w: int, + dilation_h: int, + dilation_w: int, ): X = torch.randn(B, C, H, W) X_unfold_torch = torch.nn.functional.unfold( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index 5881beb3..5f2d3c04 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Conv3d_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index 1f255e89..b1192e2a 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -20,7 +20,10 @@ from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class DPMultiheadAttentionAdapter(nn.Module): @@ -54,20 +57,20 @@ class MultiHeadAttention_test(GradSampleHooks_test): add_bias_kv=st.booleans(), add_zero_attn=st.booleans(), kv_dim=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_multihead_attention( - self, - N: int, - T: int, - D: int, - P: int, - bias: bool, - add_bias_kv: bool, - add_zero_attn: bool, - kv_dim: bool, - test_or_check: int + self, + N: int, + T: int, + D: int, + P: int, + bias: bool, + add_bias_kv: bool, + add_zero_attn: bool, + kv_dim: bool, + test_or_check: int, ): if kv_dim: @@ -93,5 +96,6 @@ def test_multihead_attention( self.run_test(x, attn, batch_first=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, attn, batch_first=False, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, attn, batch_first=False, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 23402ccf..390cd707 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ 
b/opacus/tests/grad_samples/dp_rnn_test.py @@ -21,7 +21,10 @@ from opacus.utils.packed_sequences import _gen_packed_data from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) MODELS = [ DPRNN, @@ -59,23 +62,23 @@ class RNN_test(GradSampleHooks_test): bidirectional=st.booleans(), using_packed_sequences=st.booleans(), packed_sequences_sorted=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_rnn( - self, - model, - N: int, - T: int, - D: int, - H: int, - num_layers: int, - bias: bool, - batch_first: bool, - bidirectional: bool, - using_packed_sequences: bool, - packed_sequences_sorted: bool, - test_or_check: int + self, + model, + N: int, + T: int, + D: int, + H: int, + num_layers: int, + bias: bool, + batch_first: bool, + bidirectional: bool, + using_packed_sequences: bool, + packed_sequences_sorted: bool, + test_or_check: int, ): rnn = model( D, @@ -99,5 +102,6 @@ def test_rnn( self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, rnn, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, rnn, batch_first=batch_first, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index 4e6bab79..e368a6c1 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class Embedding_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index 39f4b100..b3319aae 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -21,7 +21,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class GroupNorm_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index 7001f1c8..68bdebda 100644 --- a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class InstanceNorm1d_test(GradSampleHooks_test): @@ -27,16 +30,10 @@ class InstanceNorm1d_test(GradSampleHooks_test): N=st.integers(1, 4), C=st.integers(1, 3), 
W=st.integers(5, 10), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_3d_input( - self, - N: int, - C: int, - W: int, - test_or_check: int - ): + def test_3d_input(self, N: int, C: int, W: int, test_or_check: int): x = torch.randn([N, C, W]) norm = nn.InstanceNorm1d(num_features=C, affine=True, track_running_stats=False) @@ -45,5 +42,6 @@ def test_3d_input( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/instance_norm2d_test.py b/opacus/tests/grad_samples/instance_norm2d_test.py index 6f955f87..be622009 100644 --- a/opacus/tests/grad_samples/instance_norm2d_test.py +++ b/opacus/tests/grad_samples/instance_norm2d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class InstanceNorm2d_test(GradSampleHooks_test): @@ -28,17 +31,10 @@ class InstanceNorm2d_test(GradSampleHooks_test): C=st.integers(1, 3), W=st.integers(5, 10), H=st.integers(4, 8), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_4d_input( - self, - N: int, - C: int, - W: int, - H: int, - test_or_check: int - ): + def test_4d_input(self, N: int, C: int, W: int, H: int, test_or_check: int): x = torch.randn([N, C, H, W]) norm = nn.InstanceNorm2d(num_features=C, affine=True, track_running_stats=False) @@ -46,5 +42,6 @@ def test_4d_input( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 68d5298e..14a9d3b6 100644 --- a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class InstanceNorm3d_test(GradSampleHooks_test): @@ -29,24 +32,16 @@ class InstanceNorm3d_test(GradSampleHooks_test): W=st.integers(5, 10), H=st.integers(4, 8), Z=st.integers(1, 4), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_5d_input( - self, - N: int, - C: int, - W: int, - H: int, - Z: int, - test_or_check: int - - ): + def test_5d_input(self, N: int, C: int, W: int, H: int, Z: int, test_or_check: int): x = torch.randn([N, C, Z, H, W]) norm = nn.InstanceNorm3d(num_features=C, affine=True, track_running_stats=False) if test_or_check == 1: self.run_test(x, norm, 
batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index ea556dd8..29c3c8f5 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class LayerNorm_test(GradSampleHooks_test): @@ -30,18 +33,18 @@ class LayerNorm_test(GradSampleHooks_test): W=st.integers(5, 10), input_dim=st.integers(2, 4), norm_dim=st.integers(1, 3), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_norm( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - norm_dim: int, - test_or_check: int + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + norm_dim: int, + test_or_check: int, ): if norm_dim >= input_dim: @@ -71,5 +74,6 @@ def test_input_norm( self.run_test(x, norm, batch_first=True) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index 516c6c05..e0c72b18 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -19,7 +19,10 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import get_grad_sample_modes, check_per_sample_gradients_are_correct +from ...utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) class Linear_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index 2a3a1617..6ad6d411 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -19,7 +19,10 @@ from opacus.layers import SequenceBias from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import check_per_sample_gradients_are_correct, get_grad_sample_modes +from ...utils.per_sample_gradients_utils import ( + check_per_sample_gradients_are_correct, + get_grad_sample_modes, +) class SequenceBias_test(GradSampleHooks_test): @@ -28,16 +31,11 @@ class SequenceBias_test(GradSampleHooks_test): T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_batch_second( - self, - N: int, - T: int, - D: int, - batch_first: bool, - test_or_check: int + self, N: int, T: int, D: int, batch_first: bool, test_or_check: int ): seqbias = SequenceBias(D, batch_first) @@ -49,5 +47,9 @@ def test_batch_second( 
self.run_test(x, seqbias, batch_first, ew_compatible=False) if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=False): - assert check_per_sample_gradients_are_correct(x, seqbias, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + assert check_per_sample_gradients_are_correct( + x, + seqbias, + batch_first=batch_first, + grad_sample_mode=grad_sample_mode, + ) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 7e00a9ec..9b80d9f7 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -85,10 +85,10 @@ def forward(self, x): def compute_microbatch_grad_sample( - x: Union[torch.Tensor, List[torch.Tensor]], - module: nn.Module, - batch_first=True, - loss_reduction="mean", + x: Union[torch.Tensor, List[torch.Tensor]], + module: nn.Module, + batch_first=True, + loss_reduction="mean", ) -> Dict[str, torch.tensor]: """ Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients @@ -152,11 +152,11 @@ def compute_microbatch_grad_sample( def compute_opacus_grad_sample( - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", ) -> Dict[str, torch.tensor]: """ Runs Opacus to compute per-sample gradients and return them for testing purposes. @@ -190,9 +190,7 @@ def compute_opacus_grad_sample( opacus_grad_samples = { name: p.grad_sample - for name, p in trainable_parameters( - grad_sample_module.wrapped_module._module - ) + for name, p in trainable_parameters(grad_sample_module.wrapped_module._module) } return opacus_grad_samples @@ -209,13 +207,15 @@ def get_grad_sample_modes(use_ew: bool = False): return grad_sample_modes -def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - *, - batch_first=True, - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks") -> bool: +def check_per_sample_gradients_are_correct( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + *, + batch_first=True, + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", +) -> bool: """ A utility to check whether per sample gradients are computed correctly with a particular model. 
Args: @@ -255,10 +255,12 @@ def check_per_sample_gradients_are_correct(x: Union[torch.Tensor, PackedSequence return correct -def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean"): +def compute_microbatch_grad_sample_tensor_or_seq( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", +): if type(x) is PackedSequence: x_unpacked = unpack_packedsequences(x) microbatch_grad_samples = compute_microbatch_grad_sample( @@ -275,21 +277,22 @@ def compute_microbatch_grad_sample_tensor_or_seq(x: Union[torch.Tensor, PackedSe return microbatch_grad_samples -def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks"): - microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction) +def compute_grad_samples_microbatch_and_opacus( + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + grad_sample_mode="hooks", +): + microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq( + x, module, batch_first=batch_first, loss_reduction=loss_reduction + ) opacus_grad_samples = compute_opacus_grad_sample( x, module, batch_first=batch_first, loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode + grad_sample_mode=grad_sample_mode, ) if microbatch_grad_samples.keys() != opacus_grad_samples.keys(): @@ -303,26 +306,33 @@ def compute_grad_samples_microbatch_and_opacus(x: Union[torch.Tensor, PackedSequ def _check_per_sample_gradients_are_correct_with_reduction( - x: Union[torch.Tensor, PackedSequence], - module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks" + x: Union[torch.Tensor, PackedSequence], + module: nn.Module, + batch_first=True, + loss_reduction="mean", + atol=10e-6, + rtol=10e-5, + grad_sample_mode="hooks", ) -> bool: - microbatch_grad_samples, opacus_grad_samples = \ - compute_grad_samples_microbatch_and_opacus(x, - module, - batch_first=batch_first, - loss_reduction=loss_reduction, - grad_sample_mode=grad_sample_mode) + ( + microbatch_grad_samples, + opacus_grad_samples, + ) = compute_grad_samples_microbatch_and_opacus( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) correct = True for name, opacus_grad_sample in opacus_grad_samples.items(): microbatch_grad_sample = microbatch_grad_samples[name] - correct = correct and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, - rtol) and opacus_grad_sample.shape == microbatch_grad_sample.shape + correct = ( + correct + and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, rtol) + and opacus_grad_sample.shape == microbatch_grad_sample.shape + ) return correct @@ -353,7 +363,7 @@ def unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: def _compute_loss_packedsequences( - criterion: nn.L1Loss, x: PackedSequence + criterion: nn.L1Loss, x: PackedSequence ) -> torch.Tensor: r""" This function computes the loss in a different way for 'mean' reduced L1 loss while for 'sum' reduced L1 loss, From 36dd386a0d99efa34823396973191828fbe348bc Mon Sep 17 00:00:00 2001 From: Igor Shilov Date: Fri, 28 Oct 2022 07:52:26 -0700 Subject: [PATCH 16/32] Functorch gradients: investigation and fix 
(#510)

Summary:
*The investigation part for this PR was done by alexandresablayrolles, thanks for figuring out the reason the tests were failing*

## Background
The current implementation of functorch-based per sample gradients fails on modules which have both trainable non-recursive parameters and standard submodules, e.g. the module below:
```
class LinearWithExtraParam(nn.Module):
    def __init__(self, in_features: int, out_features: int, hidden_dim: int = 8):
        super().__init__()
        self.fc = nn.Linear(in_features, hidden_dim)
        self.extra_param = nn.Parameter(torch.randn(hidden_dim, out_features))

    def forward(self, x):
        x = self.fc(x)
        x = x.matmul(self.extra_param)
        return x
```
The reason is that the functorch hook actually computes gradients for recursive submodules too. The problem is that normal hooks are also attached to these submodules, so GradSampleModule sees two grad_sample tensors, thinks it needs to accumulate, and adds them up together.

## Solution(s)
There are essentially two ways we can fix this: either make functorch compute per sample gradients for non-recursive parameters only, or don't attach normal hooks to submodules whose parent module is handled by functorch. This diff implements the latter option (reasoning below); for demo purposes the former option can be seen in https://github.com/pytorch/opacus/issues/531

From a pure code perspective the former option (let's call it "non-recursive functorch") is more appealing to me. It better fits the existing paradigm and matches the behaviour of normal hooks - all of the existing code only deals with the immediate non-recursive parameters. However, it doesn't make much sense from the efficiency perspective: "non-recursive functorch" would do all the work to compute per-sample gradients for its submodules, only for them to be filtered out at the very last stage. The alternative option (a.k.a. "functorch for subtrees") involves somewhat more convoluted hook handling, but avoids that wasted computation.

This has a noticeable effect on performance. Below are the results of MNIST benchmarks with different configurations. I've tested several configurations because, at the end of the day, the impact on performance depends on how deep the subtrees are:

* Standard model - our model from the MNIST example, standard layers only (2 conv + 2 linear). No overhead expected, functorch doesn't kick in
* Mid-level model - leaf nodes (two linear layers) have one extra param and are computed with functorch. Overhead: 2x Linear hook
* Extreme model - the root module has one extra param and needs to be handled by functorch.
Overhead: 2x linear hook + 2x conv hook | Mode | non-recursive functorch | functorch for subtrees | |:-----------------------:|:------------------------:|:-----------------------:| | Standard model (CPU) | 138s | 136s | | Standard model (GPU) | 149s | 150s | | Mid-level model (CPU) | 157s | 150s | | Mid-level model (GPU) | 100s | 97s | | Extreme model (CPU) | 207s | 172s | | Extreme model (GPU) | 101s | 94s | Pull Request resolved: https://github.com/pytorch/opacus/pull/510 Reviewed By: alexandresablayrolles Differential Revision: D39579487 Pulled By: ffuuugor fbshipit-source-id: 1b089bd04ab110174a1f2ebb371380eb2ce76054 --- opacus/grad_sample/functorch.py | 2 +- opacus/grad_sample/grad_sample_module.py | 20 ++++- opacus/tests/privacy_engine_test.py | 85 ++++++++++++++++--- .../tests/privacy_engine_validation_test.py | 54 ++---------- opacus/tests/utils.py | 50 +++++++++++ opacus/utils/module_utils.py | 6 +- 6 files changed, 151 insertions(+), 66 deletions(-) create mode 100644 opacus/tests/utils.py diff --git a/opacus/grad_sample/functorch.py b/opacus/grad_sample/functorch.py index 97779506..ade37a1c 100644 --- a/opacus/grad_sample/functorch.py +++ b/opacus/grad_sample/functorch.py @@ -48,7 +48,7 @@ def ft_compute_per_sample_gradient(layer, activations, backprops): activations: the input to the layer backprops: the gradient of the loss w.r.t. outputs of the layer """ - parameters = list(layer.parameters()) + parameters = list(layer.parameters(recurse=True)) if not hasattr(layer, "ft_compute_sample_grad"): prepare_layer(layer) diff --git a/opacus/grad_sample/grad_sample_module.py b/opacus/grad_sample/grad_sample_module.py index d2fb0987..3b2a226e 100644 --- a/opacus/grad_sample/grad_sample_module.py +++ b/opacus/grad_sample/grad_sample_module.py @@ -18,7 +18,7 @@ import logging import warnings from functools import partial -from typing import List, Tuple +from typing import Iterable, List, Tuple import torch import torch.nn as nn @@ -26,6 +26,7 @@ from opacus.grad_sample.gsm_base import AbstractGradSampleModule from opacus.layers.dp_rnn import DPGRU, DPLSTM, DPRNN, RNNLinear from opacus.utils.module_utils import ( + has_trainable_params, requires_grad, trainable_modules, trainable_parameters, @@ -146,6 +147,21 @@ def __init__( def forward(self, *args, **kwargs): return self._module(*args, **kwargs) + def iterate_submodules(self, module: nn.Module) -> Iterable[nn.Module]: + if has_trainable_params(module): + yield module + + # Don't recurse if module is handled by functorch + if ( + has_trainable_params(module) + and type(module) not in self.GRAD_SAMPLERS + and type(module) not in [DPRNN, DPLSTM, DPGRU] + ): + return + + for m in module.children(): + yield from self.iterate_submodules(m) + def add_hooks( self, *, @@ -177,7 +193,7 @@ def add_hooks( self._module.autograd_grad_sample_hooks = [] self.autograd_grad_sample_hooks = self._module.autograd_grad_sample_hooks - for _module_name, module in trainable_modules(self._module): + for module in self.iterate_submodules(self._module): # Do not add hooks to DPRNN, DPLSTM or DPGRU as the hooks are handled by the `RNNLinear` if type(module) in [DPRNN, DPLSTM, DPGRU]: continue diff --git a/opacus/tests/privacy_engine_test.py b/opacus/tests/privacy_engine_test.py index aede7578..ed5b1f7a 100644 --- a/opacus/tests/privacy_engine_test.py +++ b/opacus/tests/privacy_engine_test.py @@ -40,6 +40,18 @@ from torchvision import models, transforms from torchvision.datasets import FakeData +from .utils import CustomLinearModule, LinearWithExtraParam + + +def 
_is_functorch_available(): + try: + # flake8: noqa F401 + import functorch + + return True + except ImportError: + return False + def get_grad_sample_aggregated(tensor: torch.Tensor, loss_type: str = "mean"): if tensor.grad_sample is None: @@ -246,7 +258,7 @@ def _compare_to_vanilla( # vanilla gradient is nearly zero: will match even with clipping continue - atol = 1e-7 if max_steps == 1 else 1e-5 + atol = 1e-7 if max_steps == 1 else 1e-4 self.assertEqual( torch.allclose(vp, pp, atol=atol, rtol=1e-3), expected_match, @@ -265,10 +277,6 @@ def _compare_to_vanilla( do_noise=st.booleans(), use_closure=st.booleans(), max_steps=st.sampled_from([1, 4]), - # do_clip=st.just(False), - # do_noise=st.just(False), - # use_closure=st.just(False), - # max_steps=st.sampled_from([4]), ) @settings(deadline=None) def test_compare_to_vanilla( @@ -799,9 +807,7 @@ def _init_data(self): ) return DataLoader(ds, batch_size=self.BATCH_SIZE, drop_last=False) - def _init_model( - self, private=False, state_dict=None, model=None, **privacy_engine_kwargs - ): + def _init_model(self): return SampleConvNet() @@ -831,9 +837,7 @@ def _init_data(self): ) return DataLoader(ds, batch_size=self.BATCH_SIZE, drop_last=False) - def _init_model( - self, private=False, state_dict=None, model=None, **privacy_engine_kwargs - ): + def _init_model(self): m = SampleConvNet() for p in itertools.chain(m.conv1.parameters(), m.gnorm1.parameters()): p.requires_grad = False @@ -841,6 +845,13 @@ def _init_model( return m +@unittest.skipIf(not _is_functorch_available(), "not supported in this torch version") +class PrivacyEngineConvNetFrozenTestFunctorch(PrivacyEngineConvNetFrozenTest): + def setUp(self): + super().setUp() + self.GRAD_SAMPLE_MODE = "functorch" + + @unittest.skipIf( torch.__version__ < API_CUTOFF_VERSION, "not supported in this torch version" ) @@ -854,6 +865,13 @@ def test_sample_grad_aggregation(self): pass +@unittest.skipIf(not _is_functorch_available(), "not supported in this torch version") +class PrivacyEngineConvNetTestFunctorch(PrivacyEngineConvNetTest): + def setUp(self): + super().setUp() + self.GRAD_SAMPLE_MODE = "functorch" + + class SampleAttnNet(nn.Module): def __init__(self): super().__init__() @@ -933,6 +951,13 @@ def _init_model( return SampleAttnNet() +@unittest.skipIf(not _is_functorch_available(), "not supported in this torch version") +class PrivacyEngineTextTestFunctorch(PrivacyEngineTextTest): + def setUp(self): + super().setUp() + self.GRAD_SAMPLE_MODE = "functorch" + + class SampleTiedWeights(nn.Module): def __init__(self, tie=True): super().__init__() @@ -972,7 +997,39 @@ def _init_data(self): ) return DataLoader(ds, batch_size=self.BATCH_SIZE, drop_last=False) - def _init_model( - self, private=False, state_dict=None, model=None, **privacy_engine_kwargs - ): + def _init_model(self): return SampleTiedWeights(tie=True) + + +@unittest.skipIf(not _is_functorch_available(), "not supported in this torch version") +class PrivacyEngineTiedWeightsTestFunctorch(PrivacyEngineTiedWeightsTest): + def setUp(self): + super().setUp() + self.GRAD_SAMPLE_MODE = "functorch" + + +class ModelWithCustomLinear(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = CustomLinearModule(4, 8) + self.fc2 = LinearWithExtraParam(8, 4) + self.extra_param = nn.Parameter(torch.randn(4, 4)) + + def forward(self, x): + x = self.fc1(x) + x = self.fc2(x) + x = x.matmul(self.extra_param) + return x + + +@unittest.skipIf(not _is_functorch_available(), "not supported in this torch version") +class 
PrivacyEngineCustomLayerTest(BasePrivacyEngineTest, unittest.TestCase): + def _init_data(self): + ds = TensorDataset( + torch.randn(self.DATA_SIZE, 4), + torch.randint(low=0, high=3, size=(self.DATA_SIZE,)), + ) + return DataLoader(ds, batch_size=self.BATCH_SIZE, drop_last=False) + + def _init_model(self): + return ModelWithCustomLinear() diff --git a/opacus/tests/privacy_engine_validation_test.py b/opacus/tests/privacy_engine_validation_test.py index 8548f73f..0ba061d8 100644 --- a/opacus/tests/privacy_engine_validation_test.py +++ b/opacus/tests/privacy_engine_validation_test.py @@ -1,58 +1,16 @@ import unittest import torch -import torch.nn as nn -import torch.nn.functional as F from opacus import PrivacyEngine from opacus.grad_sample.gsm_exp_weights import API_CUTOFF_VERSION from torch.utils.data import DataLoader - -class BasicSupportedModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=2) - self.gn = nn.GroupNorm(num_groups=2, num_channels=8) - self.fc = nn.Linear(in_features=4, out_features=8) - self.ln = nn.LayerNorm([8, 8]) - - def forward(self, x): - x = self.conv(x) - x = self.gn(x) - x = self.fc(x) - x = self.ln(x) - return x - - -class CustomLinearModule(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self._weight = nn.Parameter(torch.randn(out_features, in_features)) - self._bias = nn.Parameter(torch.randn(out_features)) - - def forward(self, x): - return F.linear(x, self._weight, self._bias) - - -class MatmulModule(nn.Module): - def __init__(self, input_features, output_features): - super().__init__() - self.weight = nn.Parameter(torch.randn(input_features, output_features)) - - def forward(self, x): - return torch.matmul(x, self.weight) - - -class LinearWithExtraParam(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self.fc = nn.Linear(in_features, out_features) - self.extra_param = nn.Parameter(torch.randn(out_features, 2)) - - def forward(self, x): - x = self.fc(x) - x = x.matmul(self.extra_param) - return x +from .utils import ( + BasicSupportedModule, + CustomLinearModule, + LinearWithExtraParam, + MatmulModule, +) class PrivacyEngineValidationTest(unittest.TestCase): diff --git a/opacus/tests/utils.py b/opacus/tests/utils.py new file mode 100644 index 00000000..36833977 --- /dev/null +++ b/opacus/tests/utils.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicSupportedModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=2) + self.gn = nn.GroupNorm(num_groups=2, num_channels=8) + self.fc = nn.Linear(in_features=4, out_features=8) + self.ln = nn.LayerNorm([8, 8]) + + def forward(self, x): + x = self.conv(x) + x = self.gn(x) + x = self.fc(x) + x = self.ln(x) + return x + + +class CustomLinearModule(nn.Module): + def __init__(self, in_features: int, out_features: int): + super().__init__() + self._weight = nn.Parameter(torch.randn(out_features, in_features)) + self._bias = nn.Parameter(torch.randn(out_features)) + + def forward(self, x): + return F.linear(x, self._weight, self._bias) + + +class MatmulModule(nn.Module): + def __init__(self, input_features: int, output_features: int): + super().__init__() + self.weight = nn.Parameter(torch.randn(input_features, output_features)) + + def forward(self, x): + return torch.matmul(x, self.weight) + + +class LinearWithExtraParam(nn.Module): + def 
__init__(self, in_features: int, out_features: int, hidden_dim: int = 8): + super().__init__() + self.fc = nn.Linear(in_features, hidden_dim) + self.extra_param = nn.Parameter(torch.randn(hidden_dim, out_features)) + + def forward(self, x): + x = self.fc(x) + x = x.matmul(self.extra_param) + return x diff --git a/opacus/utils/module_utils.py b/opacus/utils/module_utils.py index da2f6c9a..28146cef 100644 --- a/opacus/utils/module_utils.py +++ b/opacus/utils/module_utils.py @@ -31,7 +31,11 @@ logger.setLevel(level=logging.INFO) -def parametrized_modules(module: nn.Module) -> Iterable[nn.Module]: +def has_trainable_params(module: nn.Module) -> bool: + return any(p.requires_grad for p in module.parameters(recurse=False)) + + +def parametrized_modules(module: nn.Module) -> Iterable[Tuple[str, nn.Module]]: """ Recursively iterates over all submodules, returning those that have parameters (as opposed to "wrapper modules" that just organize modules). From c06ebec0d2d024e15d0cd976b42a6dd978347f86 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Fri, 28 Oct 2022 17:08:27 +0100 Subject: [PATCH 17/32] Fix flake8 errors --- opacus/tests/grad_samples/layer_norm_test.py | 27 ++++++++++++-------- opacus/utils/per_sample_gradients_utils.py | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index 29c3c8f5..b303ec49 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -49,6 +49,22 @@ def test_input_norm( if norm_dim >= input_dim: return + normalized_shape, x_shape = self.get_x_shape_and_norm_shape( + H, N, W, Z, input_dim, norm_dim + ) + + norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) + x = torch.randn(x_shape) + if test_or_check == 1: + self.run_test(x, norm, batch_first=True) + if test_or_check == 2: + for grad_sample_mode in get_grad_sample_modes(use_ew=True): + assert check_per_sample_gradients_are_correct( + x, norm, batch_first=True, grad_sample_mode=grad_sample_mode + ) + + @staticmethod + def get_x_shape_and_norm_shape(H, N, W, Z, input_dim, norm_dim): if norm_dim == 1: normalized_shape = W if input_dim == 2: @@ -67,13 +83,4 @@ def test_input_norm( elif norm_dim == 3: normalized_shape = [Z, H, W] x_shape = [N, Z, H, W] - - norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) - x = torch.randn(x_shape) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + return normalized_shape, x_shape diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 9b80d9f7..724b1542 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -235,7 +235,7 @@ def check_per_sample_gradients_are_correct( reductions = ["sum", "mean"] if grad_sample_mode == "ew": if not batch_first: - raise RuntimeError(f"Batch should be first dimension.") + raise RuntimeError("Batch should be first dimension.") if not check_torch_version_for_ew_sample(): raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") reductions = ["sum"] From 5168e205364757d93de23389a8277191c9766bb4 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 31 Oct 2022 11:44:21 +0000 Subject: [PATCH 18/32] Add type hints --- 
opacus/utils/per_sample_gradients_utils.py | 38 +++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 724b1542..7b777152 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -87,8 +87,8 @@ def forward(self, x): def compute_microbatch_grad_sample( x: Union[torch.Tensor, List[torch.Tensor]], module: nn.Module, - batch_first=True, - loss_reduction="mean", + batch_first: bool = True, + loss_reduction: str = "mean", ) -> Dict[str, torch.tensor]: """ Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients @@ -154,9 +154,9 @@ def compute_microbatch_grad_sample( def compute_opacus_grad_sample( x: Union[torch.Tensor, PackedSequence], module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", + batch_first: bool = True, + loss_reduction: str = "mean", + grad_sample_mode: str = "hooks", ) -> Dict[str, torch.tensor]: """ Runs Opacus to compute per-sample gradients and return them for testing purposes. @@ -211,10 +211,10 @@ def check_per_sample_gradients_are_correct( x: Union[torch.Tensor, PackedSequence], module: nn.Module, *, - batch_first=True, - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + batch_first: bool = True, + atol: float = 10e-6, + rtol: float = 10e-5, + grad_sample_mode: str = "hooks", ) -> bool: """ A utility to check whether per sample gradients are computed correctly with a particular model. @@ -258,8 +258,8 @@ def check_per_sample_gradients_are_correct( def compute_microbatch_grad_sample_tensor_or_seq( x: Union[torch.Tensor, PackedSequence], module: nn.Module, - batch_first=True, - loss_reduction="mean", + batch_first: bool = True, + loss_reduction: str = "mean", ): if type(x) is PackedSequence: x_unpacked = unpack_packedsequences(x) @@ -280,9 +280,9 @@ def compute_microbatch_grad_sample_tensor_or_seq( def compute_grad_samples_microbatch_and_opacus( x: Union[torch.Tensor, PackedSequence], module: nn.Module, - batch_first=True, - loss_reduction="mean", - grad_sample_mode="hooks", + batch_first: bool = True, + loss_reduction: str = "mean", + grad_sample_mode: str = "hooks", ): microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq( x, module, batch_first=batch_first, loss_reduction=loss_reduction @@ -308,11 +308,11 @@ def compute_grad_samples_microbatch_and_opacus( def _check_per_sample_gradients_are_correct_with_reduction( x: Union[torch.Tensor, PackedSequence], module: nn.Module, - batch_first=True, - loss_reduction="mean", - atol=10e-6, - rtol=10e-5, - grad_sample_mode="hooks", + batch_first: bool = True, + loss_reduction: str = "mean", + atol: float = 10e-6, + rtol: float = 10e-5, + grad_sample_mode: str = "hooks", ) -> bool: ( microbatch_grad_samples, From b71fb30a134a4b6aa6cb44639d92184d9ef7b7ce Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 31 Oct 2022 13:07:49 +0000 Subject: [PATCH 19/32] Refactor --- opacus/utils/per_sample_gradients_utils.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 7b777152..f1bb4d74 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -240,9 +240,8 @@ def check_per_sample_gradients_are_correct( raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") reductions = ["sum"] 
- correct = True for loss_reduction in reductions: - correct = correct and _check_per_sample_gradients_are_correct_with_reduction( + if not _check_per_sample_gradients_are_correct_with_reduction( x, module, batch_first=batch_first, @@ -250,9 +249,10 @@ def check_per_sample_gradients_are_correct( atol=atol, rtol=rtol, grad_sample_mode=grad_sample_mode, - ) + ): + return False - return correct + return True def compute_microbatch_grad_sample_tensor_or_seq( @@ -325,16 +325,13 @@ def _check_per_sample_gradients_are_correct_with_reduction( grad_sample_mode=grad_sample_mode, ) - correct = True for name, opacus_grad_sample in opacus_grad_samples.items(): microbatch_grad_sample = microbatch_grad_samples[name] - correct = ( - correct - and np.allclose(microbatch_grad_sample, opacus_grad_sample, atol, rtol) - and opacus_grad_sample.shape == microbatch_grad_sample.shape - ) - - return correct + if not opacus_grad_sample.shape == microbatch_grad_sample.shape: + return False + if not torch.allclose(microbatch_grad_sample, opacus_grad_sample, atol, rtol): + return False + return True def unpack_packedsequences(X: PackedSequence) -> List[torch.Tensor]: From cdcae861a37f5250d776996aa7a0903bb9547a1e Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 31 Oct 2022 13:14:39 +0000 Subject: [PATCH 20/32] Update docstrings --- opacus/utils/per_sample_gradients_utils.py | 26 ++++++++++++++++------ 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index f1bb4d74..d60383dd 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -96,8 +96,8 @@ def compute_microbatch_grad_sample( as this method is obviously correct, but slow. Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + x: Sample input batch + module: The nn.Module you want to test. batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. loss_reduction: Indicates if the loss reduction (for aggregating the gradients) @@ -162,8 +162,8 @@ def compute_opacus_grad_sample( Runs Opacus to compute per-sample gradients and return them for testing purposes. Args: - x: The tensor in input to the ``module`` - module: The ``ModelWithLoss`` that wraps the nn.Module you want to test. + x: Sample input batch + module: The nn.Module you want to test. batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. loss_reduction: What reduction to apply to the loss. Defaults to "mean". @@ -218,16 +218,28 @@ def check_per_sample_gradients_are_correct( ) -> bool: """ A utility to check whether per sample gradients are computed correctly with a particular model. + The check is performed by comparing the result of the slow but reliable micro-batch method `compute_microbatch_grad_sample` + with the result of optimized opacus method. + Args: - x: The tensor in input to the ``module`` + x: Sample input batch module: The ``ModelWithLoss`` that wraps the nn.Module you want to check. batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. - atol: The relative tolerance parameter (numpy). - rtol: The absolute tolerance parameter (numpy). + atol: The relative tolerance parameter (torch.allclose). + rtol: The absolute tolerance parameter (torch.allclose). grad_sample_mode: What sampling method to use to get gradients. 
Returns: True if per sample gradients were computed correctly. False otherwise. + + Example: + >>> x_shape = [N, Z, W] + >>> x = torch.randn(x_shape) + >>> model = nn.Linear(W, W + 2) + >>> assert check_per_sample_gradients_are_correct( + ... x, + ... model + ... ) # This will fail only if the opacus per sample gradients do not match the micro-batch gradients. """ if grad_sample_mode == "functorch": import functorch # noqa From ab1d6a7e503798e304516208348d84cd7e219728 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 31 Oct 2022 16:11:42 +0000 Subject: [PATCH 21/32] Fix reduction modes for EW --- opacus/tests/grad_samples/common.py | 18 +++++------------- opacus/utils/per_sample_gradients_utils.py | 4 ---- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index ec0a1c53..a16a2f25 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import unittest from typing import Union @@ -60,8 +59,11 @@ def run_test( if type(x) is not PackedSequence and x.numel() == 0: grad_sample_modes = ["hooks"] - for grad_sample_mode in grad_sample_modes: - for loss_reduction in ["sum", "mean"]: + if ew_compatible and batch_first and torch.__version__ >= (1, 13): + grad_sample_modes += ["ew"] + + for loss_reduction in ["sum", "mean"]: + for grad_sample_mode in grad_sample_modes: with self.subTest( grad_sample_mode=grad_sample_mode, loss_reduction=loss_reduction ): @@ -74,16 +76,6 @@ def run_test( rtol=rtol, grad_sample_mode=grad_sample_mode, ) - if ew_compatible and batch_first and torch.__version__ >= (1, 13): - self.run_test_with_reduction( - x, - module, - batch_first=batch_first, - loss_reduction="sum", - atol=atol, - rtol=rtol, - grad_sample_mode="ew", - ) def run_test_with_reduction( self, diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index d60383dd..c75ae152 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -241,16 +241,12 @@ def check_per_sample_gradients_are_correct( ... model ... ) # This will fail only if the opacus per sample gradients do not match the micro-batch gradients. 
""" - if grad_sample_mode == "functorch": - import functorch # noqa - reductions = ["sum", "mean"] if grad_sample_mode == "ew": if not batch_first: raise RuntimeError("Batch should be first dimension.") if not check_torch_version_for_ew_sample(): raise RuntimeError(f"Unsupported torch version: {torch.__version__}.") - reductions = ["sum"] for loss_reduction in reductions: if not _check_per_sample_gradients_are_correct_with_reduction( From 206a0424fa9c2cf515effc072343ff7157345649 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 1 Nov 2022 11:48:58 +0000 Subject: [PATCH 22/32] Rebase on #530, separate utils tests, refactor --- opacus/tests/grad_samples/common.py | 4 + opacus/tests/grad_samples/conv1d_test.py | 20 +-- opacus/tests/grad_samples/conv2d_test.py | 70 ++++----- opacus/tests/grad_samples/conv3d_test.py | 59 +++----- .../dp_multihead_attention_test.py | 10 +- opacus/tests/grad_samples/dp_rnn_test.py | 10 +- opacus/tests/grad_samples/embedding_test.py | 27 ++-- opacus/tests/grad_samples/group_norm_test.py | 19 +-- .../grad_samples/instance_norm1d_test.py | 17 +-- .../grad_samples/instance_norm2d_test.py | 15 +- .../grad_samples/instance_norm3d_test.py | 11 +- opacus/tests/grad_samples/layer_norm_test.py | 18 +-- opacus/tests/grad_samples/linear_test.py | 27 ++-- .../tests/grad_samples/sequence_bias_test.py | 20 +-- .../tests/per_sample_gradients_utils_test.py | 139 ++++++++++++++++++ opacus/utils/per_sample_gradients_utils.py | 18 ++- 16 files changed, 254 insertions(+), 230 deletions(-) create mode 100644 opacus/tests/per_sample_gradients_utils_test.py diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index a16a2f25..fc74b79c 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -87,6 +87,10 @@ def run_test_with_reduction( rtol=10e-5, grad_sample_mode="hooks", ): + if ( + not type(x) is PackedSequence and x.numel() <= 0 + ): # We've checked opacus can handle 0-sized batch. 
Microbatch doesn't make sense + return ( microbatch_grad_samples, opacus_grad_samples, diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 795539dc..d9912e44 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -37,8 +37,7 @@ class Conv1d_test(GradSampleHooks_test): stride=st.integers(1, 2), padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), - groups=st.integers(1, 12), - test_or_check=st.integers(1, 2), + groups=st.integers(1, 12) ) @settings(deadline=10000) def test_conv1d( @@ -52,7 +51,6 @@ def test_conv1d( padding: int, dilation: int, groups: int, - test_or_check: int, ): if padding == "same" and stride != 1: @@ -73,10 +71,12 @@ def test_conv1d( dilation=dilation, groups=groups, ) - ew_compatible=N > 0 - if test_or_check == 1: - self.run_test(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, ew_compatible=ew_compatible) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - assert check_per_sample_gradients_are_correct(x, conv, batch_first=True, atol=10e-5, rtol=10e-4, - grad_sample_mode=grad_sample_mode) + ew_compatible = N > 0 + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + ew_compatible=ew_compatible, + ) diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 185c9246..3f849c18 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -43,22 +43,20 @@ class Conv2d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, "same", "valid"]), dilation=st.integers(1, 3), groups=st.integers(1, 16), - test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv2d( - self, - N: int, - C: int, - H: int, - W: int, - out_channels_mapper: Callable[[int], int], - kernel_size: int, - stride: int, - padding: int, - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + H: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, ): if padding == "same" and stride != 1: return @@ -83,7 +81,21 @@ def test_conv2d( ) # TODO add support for padding = 'same' with EW # Test regular GSM - if test_or_check == 1: + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-4, + ew_compatible=is_ew_compatible, + ) + if padding != "same" and N > 0: + # Test 'convolution as a backward' GSM + # 'convolution as a backward' doesn't support padding=same + conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] + GradSampleModule.GRAD_SAMPLERS[ + nn.Conv2d + ] = convolution2d_backward_as_a_convolution self.run_test( x, conv, @@ -92,7 +104,6 @@ def test_conv2d( rtol=10e-4, ew_compatible=is_ew_compatible, ) - if test_or_check == 2: for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): assert check_per_sample_gradients_are_correct( x, @@ -100,35 +111,8 @@ def test_conv2d( batch_first=True, atol=10e-5, rtol=10e-4, - grad_sample_mode=grad_sample_mode - ) - - if padding != "same" and N > 0: - # Test 'convolution as a backward' GSM - # 'convolution as a backward' doesn't support padding=same - conv2d_gsm = GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] - GradSampleModule.GRAD_SAMPLERS[ - nn.Conv2d - ] = convolution2d_backward_as_a_convolution - if test_or_check == 1: - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-4, - ew_compatible=is_ew_compatible, 
+ grad_sample_mode=grad_sample_mode, ) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): - assert check_per_sample_gradients_are_correct( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-4, - grad_sample_mode=grad_sample_mode, - ) GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] = conv2d_gsm @given( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index 5f2d3c04..e50909e2 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,10 +21,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class Conv3d_test(GradSampleHooks_test): @@ -40,30 +36,28 @@ class Conv3d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, (1, 2, 3), "same", "valid"]), dilation=st.sampled_from([1, (1, 2, 2)]), groups=st.integers(1, 16), - test_or_check=st.integers(1, 2) ) @settings(deadline=30000) def test_conv3d( - self, - N: int, - C: int, - D: int, - H: int, - W: int, - out_channels_mapper: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]], - padding: Union[int, Tuple[int]], - dilation: int, - groups: int, - test_or_check: int + self, + N: int, + C: int, + D: int, + H: int, + W: int, + out_channels_mapper: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]], + padding: Union[int, Tuple[int]], + dilation: int, + groups: int, ): if padding == "same" and stride != 1: return out_channels = out_channels_mapper(C) if ( - C % groups != 0 or out_channels % groups != 0 + C % groups != 0 or out_channels % groups != 0 ): # since in_channels and out_channels must be divisible by groups return x = torch.randn([N, C, D, H, W]) @@ -79,22 +73,11 @@ def test_conv3d( is_ew_compatible = ( dilation == 1 and padding != "same" and N > 0 ) # TODO add support for padding = 'same' with EW - if test_or_check == 1: - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - ew_compatible=is_ew_compatible, - ) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): - assert check_per_sample_gradients_are_correct( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - grad_sample_mode=grad_sample_mode - ) + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + ew_compatible=is_ew_compatible, + ) diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index b1192e2a..d3f902c4 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -57,7 +57,6 @@ class MultiHeadAttention_test(GradSampleHooks_test): add_bias_kv=st.booleans(), add_zero_attn=st.booleans(), kv_dim=st.booleans(), - test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_multihead_attention( @@ -70,7 +69,6 @@ def test_multihead_attention( add_bias_kv: bool, add_zero_attn: bool, kv_dim: bool, - test_or_check: int, ): if kv_dim: @@ -92,10 +90,4 @@ def test_multihead_attention( v = torch.randn([T, N, D]) x = torch.stack((q, k, v), dim=-1) - if test_or_check == 1: - self.run_test(x, attn, batch_first=False) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, attn, 
batch_first=False, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, attn, batch_first=False) diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 390cd707..109020e9 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -62,7 +62,6 @@ class RNN_test(GradSampleHooks_test): bidirectional=st.booleans(), using_packed_sequences=st.booleans(), packed_sequences_sorted=st.booleans(), - test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_rnn( @@ -78,7 +77,6 @@ def test_rnn( bidirectional: bool, using_packed_sequences: bool, packed_sequences_sorted: bool, - test_or_check: int, ): rnn = model( D, @@ -98,10 +96,4 @@ def test_rnn( else: x = torch.randn([T, N, D]) - if test_or_check == 1: - self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, rnn, batch_first=batch_first, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, rnn, batch_first=batch_first, ew_compatible=False) diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index e368a6c1..35d3e2bf 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -35,20 +35,18 @@ class Embedding_test(GradSampleHooks_test): D=st.integers(10, 17), dim=st.integers(2, 4), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_input_across_dims( - self, - N: int, - T: int, - Q: int, - R: int, - V: int, - D: int, - dim: int, - batch_first: bool, - test_or_check: int + self, + N: int, + T: int, + Q: int, + R: int, + V: int, + D: int, + dim: int, + batch_first: bool, ): if dim == 1: # TODO: fix when dim is 1 @@ -63,9 +61,4 @@ def test_input_across_dims( emb = nn.Embedding(V, D) x = torch.randint(low=0, high=V - 1, size=size) ew_compatible = N > 0 - if test_or_check == 1: - self.run_test(x, emb, batch_first=batch_first, ew_compatible=ew_compatible) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - assert check_per_sample_gradients_are_correct(x, emb, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + self.run_test(x, emb, batch_first=batch_first, ew_compatible=ew_compatible) diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index b3319aae..9d63ef09 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -39,17 +39,10 @@ class GroupNorm_test(GradSampleHooks_test): H=st.integers(5, 10), W=st.integers(4, 8), num_groups=st.sampled_from([1, 4, "C"]), - test_or_check=st.integers(1, 2) ) @settings(deadline=10000) def test_3d_input_groups( - self, - N: int, - C: int, - H: int, - W: int, - num_groups: Union[int, str], - test_or_check: int + self, N: int, C: int, H: int, W: int, num_groups: Union[int, str] ): if num_groups == "C": @@ -59,12 +52,6 @@ def test_3d_input_groups( return x = torch.randn([N, C, H, W]) - ew_compatible=N > 0 + ew_compatible = N > 0 norm = nn.GroupNorm(num_groups=num_groups, num_channels=C, affine=True) - self.run_test(x, norm, batch_first=True) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True, ew_compatible=ew_compatible) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - assert 
check_per_sample_gradients_are_correct(x, norm, batch_first=True, - grad_sample_mode=grad_sample_mode) + self.run_test(x, norm, batch_first=True, ew_compatible=ew_compatible) diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index 68bdebda..fac4b738 100644 --- a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -26,22 +26,11 @@ class InstanceNorm1d_test(GradSampleHooks_test): - @given( - N=st.integers(1, 4), - C=st.integers(1, 3), - W=st.integers(5, 10), - test_or_check=st.integers(1, 2), - ) + @given(N=st.integers(1, 4), C=st.integers(1, 3), W=st.integers(5, 10)) @settings(deadline=10000) - def test_3d_input(self, N: int, C: int, W: int, test_or_check: int): + def test_3d_input(self, N: int, C: int, W: int): x = torch.randn([N, C, W]) norm = nn.InstanceNorm1d(num_features=C, affine=True, track_running_stats=False) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, norm, batch_first=True) diff --git a/opacus/tests/grad_samples/instance_norm2d_test.py b/opacus/tests/grad_samples/instance_norm2d_test.py index be622009..81fae806 100644 --- a/opacus/tests/grad_samples/instance_norm2d_test.py +++ b/opacus/tests/grad_samples/instance_norm2d_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class InstanceNorm2d_test(GradSampleHooks_test): @@ -31,17 +27,10 @@ class InstanceNorm2d_test(GradSampleHooks_test): C=st.integers(1, 3), W=st.integers(5, 10), H=st.integers(4, 8), - test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_4d_input(self, N: int, C: int, W: int, H: int, test_or_check: int): + def test_4d_input(self, N: int, C: int, W: int, H: int): x = torch.randn([N, C, H, W]) norm = nn.InstanceNorm2d(num_features=C, affine=True, track_running_stats=False) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, norm, batch_first=True) diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 14a9d3b6..7145f2bc 100644 --- a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -32,16 +32,9 @@ class InstanceNorm3d_test(GradSampleHooks_test): W=st.integers(5, 10), H=st.integers(4, 8), Z=st.integers(1, 4), - test_or_check=st.integers(1, 2), ) @settings(deadline=10000) - def test_5d_input(self, N: int, C: int, W: int, H: int, Z: int, test_or_check: int): + def test_5d_input(self, N: int, C: int, W: int, H: int, Z: int): x = torch.randn([N, C, Z, H, W]) norm = nn.InstanceNorm3d(num_features=C, affine=True, track_running_stats=False) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + 
self.run_test(x, norm, batch_first=True) diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index b303ec49..d0301ad7 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -33,18 +33,10 @@ class LayerNorm_test(GradSampleHooks_test): W=st.integers(5, 10), input_dim=st.integers(2, 4), norm_dim=st.integers(1, 3), - test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_norm( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - norm_dim: int, - test_or_check: int, + self, N: int, Z: int, W: int, H: int, input_dim: int, norm_dim: int ): if norm_dim >= input_dim: @@ -55,13 +47,7 @@ def test_input_norm( norm = nn.LayerNorm(normalized_shape, elementwise_affine=True) x = torch.randn(x_shape) - if test_or_check == 1: - self.run_test(x, norm, batch_first=True) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, norm, batch_first=True, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, norm, batch_first=True) @staticmethod def get_x_shape_and_norm_shape(H, N, W, Z, input_dim, norm_dim): diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index e0c72b18..25eba672 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -34,19 +34,19 @@ class Linear_test(GradSampleHooks_test): input_dim=st.integers(2, 4), bias=st.booleans(), batch_first=st.booleans(), - test_or_check=st.integers(1, 2) + test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_input_bias( - self, - N: int, - Z: int, - W: int, - H: int, - input_dim: int, - bias: bool, - batch_first: bool, - test_or_check: int + self, + N: int, + Z: int, + W: int, + H: int, + input_dim: int, + bias: bool, + batch_first: bool, + test_or_check: int, ): if input_dim == 2: @@ -64,9 +64,4 @@ def test_input_bias( if not batch_first: x = x.transpose(0, 1) ew_compatible = N > 0 - if test_or_check == 1: - self.run_test(x, linear, batch_first=batch_first, ew_compatible=ew_compatible) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - assert check_per_sample_gradients_are_correct(x, linear, batch_first=batch_first, - grad_sample_mode=grad_sample_mode) + self.run_test(x, linear, batch_first=batch_first, ew_compatible=ew_compatible) diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index 6ad6d411..71888757 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -16,13 +16,9 @@ import hypothesis.strategies as st import torch from hypothesis import given, settings -from opacus.layers import SequenceBias +from opacus.layers import SequenceBias from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class SequenceBias_test(GradSampleHooks_test): @@ -31,11 +27,10 @@ class SequenceBias_test(GradSampleHooks_test): T=st.integers(10, 20), D=st.integers(4, 8), batch_first=st.booleans(), - test_or_check=st.integers(1, 2), ) @settings(deadline=10000) def test_batch_second( - self, N: int, T: int, D: int, batch_first: bool, test_or_check: int + self, N: int, T: int, D: int, batch_first: bool ): seqbias = SequenceBias(D, batch_first) @@ -43,13 +38,4 @@ def 
test_batch_second( x = torch.randn([N, T, D]) else: x = torch.randn([T, N, D]) - if test_or_check == 1: - self.run_test(x, seqbias, batch_first, ew_compatible=False) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=False): - assert check_per_sample_gradients_are_correct( - x, - seqbias, - batch_first=batch_first, - grad_sample_mode=grad_sample_mode, - ) + self.run_test(x, seqbias, batch_first, ew_compatible=False) diff --git a/opacus/tests/per_sample_gradients_utils_test.py b/opacus/tests/per_sample_gradients_utils_test.py new file mode 100644 index 00000000..62aacf33 --- /dev/null +++ b/opacus/tests/per_sample_gradients_utils_test.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from typing import Callable + +import hypothesis.strategies as st +import torch +from hypothesis import given, settings +from torch import nn + +from opacus.tests.grad_samples.common import expander, shrinker +from opacus.utils.per_sample_gradients_utils import ( + get_grad_sample_modes, + check_per_sample_gradients_are_correct, +) + + +class PerSampleGradientsUtilsTest(unittest.TestCase): + def per_sample_grads_utils_test(self, x, model, ew_compatible, is_empty=False): + if is_empty: + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): + with self.assertRaises(RuntimeError): + check_per_sample_gradients_are_correct( + x, + model, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode, + ) + return + + for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): + assert check_per_sample_gradients_are_correct( + x, + model, + batch_first=True, + atol=10e-5, + rtol=10e-4, + grad_sample_mode=grad_sample_mode, + ) + + @given( + N=st.integers(0, 4), + C=st.sampled_from([1, 3, 32]), + W=st.integers(6, 10), + out_channels_mapper=st.sampled_from([expander, shrinker]), + kernel_size=st.integers(2, 3), + stride=st.integers(1, 2), + padding=st.sampled_from([0, 1, 2, "same", "valid"]), + dilation=st.integers(1, 2), + groups=st.integers(1, 12), + ) + @settings(deadline=10000) + def test_conv1d( + self, + N: int, + C: int, + W: int, + out_channels_mapper: Callable[[int], int], + kernel_size: int, + stride: int, + padding: int, + dilation: int, + groups: int, + ): + if padding == "same" and stride != 1: + return + out_channels = out_channels_mapper(C) + if ( + C % groups != 0 or out_channels % groups != 0 + ): # since in_channels and out_channels must be divisible by groups + return + + x = torch.randn([N, C, W]) + conv = nn.Conv1d( + in_channels=C, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + ew_compatible = N > 0 + + self.per_sample_grads_utils_test(x, conv, ew_compatible, N == 0) + + @given( + N=st.integers(0, 4), + Z=st.integers(1, 4), + H=st.integers(1, 3), + W=st.integers(10, 17), + input_dim=st.integers(2, 4), + bias=st.booleans(), + 
batch_first=st.booleans(), + ) + @settings(deadline=10000) + def test_linear( + self, + N: int, + Z: int, + H: int, + W: int, + input_dim: int, + bias: bool, + batch_first: bool, + ): + + if input_dim == 2: + if not batch_first: + return # see https://github.com/pytorch/opacus/pull/265 + else: + x_shape = [N, W] + if input_dim == 3: + x_shape = [N, Z, W] + if input_dim == 4: + x_shape = [N, Z, H, W] + + linear = nn.Linear(W, W + 2, bias=bias) + x = torch.randn(x_shape) + if not batch_first: + x = x.transpose(0, 1) + ew_compatible = N > 0 + + self.per_sample_grads_utils_test(x, linear, ew_compatible, N == 0) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index c75ae152..bedfe579 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -292,9 +292,21 @@ def compute_grad_samples_microbatch_and_opacus( loss_reduction: str = "mean", grad_sample_mode: str = "hooks", ): - microbatch_grad_samples = compute_microbatch_grad_sample_tensor_or_seq( - x, module, batch_first=batch_first, loss_reduction=loss_reduction - ) + if type(x) is PackedSequence: + x_unpacked = unpack_packedsequences(x) + microbatch_grad_samples = compute_microbatch_grad_sample( + x_unpacked, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + ) + elif x.numel() > 0: + microbatch_grad_samples = compute_microbatch_grad_sample( + x, module, batch_first=batch_first, loss_reduction=loss_reduction + ) + else: + raise RuntimeError("x is expected to be non-empty.") + opacus_grad_samples = compute_opacus_grad_sample( x, module, From f9a35de03d621ab262832e960ae435b1fcae8034 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 1 Nov 2022 11:52:43 +0000 Subject: [PATCH 23/32] Optimize imports --- opacus/tests/grad_samples/conv1d_test.py | 4 ---- opacus/tests/grad_samples/conv2d_test.py | 17 ++--------------- .../grad_samples/dp_multihead_attention_test.py | 6 +----- opacus/tests/grad_samples/dp_rnn_test.py | 6 +----- opacus/tests/grad_samples/embedding_test.py | 4 ---- opacus/tests/grad_samples/group_norm_test.py | 4 ---- .../tests/grad_samples/instance_norm1d_test.py | 4 ---- .../tests/grad_samples/instance_norm3d_test.py | 4 ---- opacus/tests/grad_samples/layer_norm_test.py | 4 ---- opacus/tests/grad_samples/linear_test.py | 4 ---- 10 files changed, 4 insertions(+), 53 deletions(-) diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index d9912e44..0aaa1739 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -21,10 +21,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class Conv1d_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 3f849c18..3316bd8e 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -19,16 +19,12 @@ import torch import torch.nn as nn from hypothesis import given, settings +from torch.testing import assert_allclose + from opacus.grad_sample.conv import convolution2d_backward_as_a_convolution from opacus.grad_sample.grad_sample_module import GradSampleModule from opacus.utils.tensor_utils import unfold2d -from torch.testing import assert_allclose - from .common import GradSampleHooks_test, expander, 
shrinker -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class Conv2d_test(GradSampleHooks_test): @@ -104,15 +100,6 @@ def test_conv2d( rtol=10e-4, ew_compatible=is_ew_compatible, ) - for grad_sample_mode in get_grad_sample_modes(use_ew=is_ew_compatible): - assert check_per_sample_gradients_are_correct( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-4, - grad_sample_mode=grad_sample_mode, - ) GradSampleModule.GRAD_SAMPLERS[nn.Conv2d] = conv2d_gsm @given( diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index d3f902c4..9e927cae 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -17,13 +17,9 @@ import torch import torch.nn as nn from hypothesis import given, settings -from opacus.layers import DPMultiheadAttention +from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class DPMultiheadAttentionAdapter(nn.Module): diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index 109020e9..fcbe2d2a 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -17,14 +17,10 @@ import torch import torch.nn as nn from hypothesis import given, settings + from opacus.layers import DPGRU, DPLSTM, DPRNN from opacus.utils.packed_sequences import _gen_packed_data - from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) MODELS = [ DPRNN, diff --git a/opacus/tests/grad_samples/embedding_test.py b/opacus/tests/grad_samples/embedding_test.py index 35d3e2bf..ba1f28e2 100644 --- a/opacus/tests/grad_samples/embedding_test.py +++ b/opacus/tests/grad_samples/embedding_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class Embedding_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/group_norm_test.py b/opacus/tests/grad_samples/group_norm_test.py index 9d63ef09..bdc0ec79 100644 --- a/opacus/tests/grad_samples/group_norm_test.py +++ b/opacus/tests/grad_samples/group_norm_test.py @@ -21,10 +21,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class GroupNorm_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/instance_norm1d_test.py b/opacus/tests/grad_samples/instance_norm1d_test.py index fac4b738..e2e284f4 100644 --- a/opacus/tests/grad_samples/instance_norm1d_test.py +++ b/opacus/tests/grad_samples/instance_norm1d_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class InstanceNorm1d_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/instance_norm3d_test.py b/opacus/tests/grad_samples/instance_norm3d_test.py index 7145f2bc..d46050ec 100644 --- 
a/opacus/tests/grad_samples/instance_norm3d_test.py +++ b/opacus/tests/grad_samples/instance_norm3d_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class InstanceNorm3d_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/layer_norm_test.py b/opacus/tests/grad_samples/layer_norm_test.py index d0301ad7..c5b5eb83 100644 --- a/opacus/tests/grad_samples/layer_norm_test.py +++ b/opacus/tests/grad_samples/layer_norm_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class LayerNorm_test(GradSampleHooks_test): diff --git a/opacus/tests/grad_samples/linear_test.py b/opacus/tests/grad_samples/linear_test.py index 25eba672..67b9565a 100644 --- a/opacus/tests/grad_samples/linear_test.py +++ b/opacus/tests/grad_samples/linear_test.py @@ -19,10 +19,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class Linear_test(GradSampleHooks_test): From f7880d8c8a1fc64d31f5488d8f82f6e5d9db09a0 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 1 Nov 2022 15:39:07 +0000 Subject: [PATCH 24/32] Fix test --- opacus/tests/grad_samples/conv1d_test.py | 2 +- opacus/tests/grad_samples/conv3d_test.py | 33 +++++-------------- .../tests/grad_samples/sequence_bias_test.py | 4 +-- 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/opacus/tests/grad_samples/conv1d_test.py b/opacus/tests/grad_samples/conv1d_test.py index 0aaa1739..598ec379 100644 --- a/opacus/tests/grad_samples/conv1d_test.py +++ b/opacus/tests/grad_samples/conv1d_test.py @@ -33,7 +33,7 @@ class Conv1d_test(GradSampleHooks_test): stride=st.integers(1, 2), padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), - groups=st.integers(1, 12) + groups=st.integers(1, 12), ) @settings(deadline=10000) def test_conv1d( diff --git a/opacus/tests/grad_samples/conv3d_test.py b/opacus/tests/grad_samples/conv3d_test.py index 5b39e026..e50909e2 100644 --- a/opacus/tests/grad_samples/conv3d_test.py +++ b/opacus/tests/grad_samples/conv3d_test.py @@ -21,10 +21,6 @@ from hypothesis import given, settings from .common import GradSampleHooks_test, expander, shrinker -from ...utils.per_sample_gradients_utils import ( - get_grad_sample_modes, - check_per_sample_gradients_are_correct, -) class Conv3d_test(GradSampleHooks_test): @@ -40,7 +36,6 @@ class Conv3d_test(GradSampleHooks_test): padding=st.sampled_from([0, 2, (1, 2, 3), "same", "valid"]), dilation=st.sampled_from([1, (1, 2, 2)]), groups=st.integers(1, 16), - test_or_check=st.integers(1, 2), ) @settings(deadline=30000) def test_conv3d( @@ -56,7 +51,6 @@ def test_conv3d( padding: Union[int, Tuple[int]], dilation: int, groups: int, - test_or_check: int, ): if padding == "same" and stride != 1: @@ -79,22 +73,11 @@ def test_conv3d( is_ew_compatible = ( dilation == 1 and padding != "same" and N > 0 ) # TODO add support for padding = 'same' with EW - if test_or_check == 1: - self.run_test( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - ew_compatible=is_ew_compatible, - ) - if test_or_check == 2: - for grad_sample_mode in 
get_grad_sample_modes(use_ew=is_ew_compatible): - assert check_per_sample_gradients_are_correct( - x, - conv, - batch_first=True, - atol=10e-5, - rtol=10e-3, - grad_sample_mode=grad_sample_mode, - ) + self.run_test( + x, + conv, + batch_first=True, + atol=10e-5, + rtol=10e-3, + ew_compatible=is_ew_compatible, + ) diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index 71888757..cac8dea7 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -29,9 +29,7 @@ class SequenceBias_test(GradSampleHooks_test): batch_first=st.booleans(), ) @settings(deadline=10000) - def test_batch_second( - self, N: int, T: int, D: int, batch_first: bool - ): + def test_batch_second(self, N: int, T: int, D: int, batch_first: bool): seqbias = SequenceBias(D, batch_first) if batch_first: From 1ae50cb979a01e564e9c22414350f450b05c1833 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 1 Nov 2022 16:05:14 +0000 Subject: [PATCH 25/32] Add utility description to tutorial --- tutorials/guide_to_grad_sampler.ipynb | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tutorials/guide_to_grad_sampler.ipynb b/tutorials/guide_to_grad_sampler.ipynb index 94f95f84..0076cd74 100644 --- a/tutorials/guide_to_grad_sampler.ipynb +++ b/tutorials/guide_to_grad_sampler.ipynb @@ -246,6 +246,33 @@ "\n", "If you have any questions or comments, please don't hesitate to post them on our [forum](https://discuss.pytorch.org/c/opacus/29)." ] + }, + { + "cell_type": "markdown", + "source": [ + "### Per-sample-gradients correctness utility\n", + "[Here](https://github.com/pytorch/opacus/blob/main/opacus/utils/per_sample_gradients_utils.py) you can find a simple utility function `check_per_sample_gradients_are_correct` that checks if the gradient sampler works correctly with a particular module." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "x_shape = [N, Z, W]\n", + "x = torch.randn(x_shape)\n", + "model = nn.Linear(W, W + 2)\n", + "assert check_per_sample_gradients_are_correct(\n", + " x,\n", + " model\n", + " ) # This will fail only if the opacus per sample gradients do not match the micro-batch gradients." 
+ ], + "metadata": { + "collapsed": false + } } ], "metadata": { From 8c67f407c861d9c1961fbafbf1c999624d1361f7 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 7 Nov 2022 15:32:28 +0000 Subject: [PATCH 26/32] Fix grad samples test --- .../grad_samples/dp_multihead_attention_test.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index d4bd17de..9ae3448c 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -20,10 +20,6 @@ from opacus.layers import DPMultiheadAttention from .common import GradSampleHooks_test -from ...utils.per_sample_gradients_utils import ( - check_per_sample_gradients_are_correct, - get_grad_sample_modes, -) class DPMultiheadAttentionAdapter(nn.Module): @@ -92,10 +88,4 @@ def test_multihead_attention( v = torch.randn([T, N, D]) x = torch.stack((q, k, v), dim=-1) - if test_or_check == 1: - self.run_test(x, attn, batch_first=False) - if test_or_check == 2: - for grad_sample_mode in get_grad_sample_modes(use_ew=True): - assert check_per_sample_gradients_are_correct( - x, attn, batch_first=False, grad_sample_mode=grad_sample_mode - ) + self.run_test(x, attn, batch_first=False) From 0faf6618c52710d23736bd395f7d620f37492030 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 7 Nov 2022 16:08:05 +0000 Subject: [PATCH 27/32] Fixed isort warnings --- opacus/tests/grad_samples/common.py | 5 ++--- opacus/tests/grad_samples/conv2d_test.py | 4 ++-- opacus/tests/grad_samples/dp_multihead_attention_test.py | 2 +- opacus/tests/grad_samples/dp_rnn_test.py | 3 ++- opacus/tests/grad_samples/sequence_bias_test.py | 2 +- opacus/tests/per_sample_gradients_utils_test.py | 5 ++--- opacus/utils/per_sample_gradients_utils.py | 5 ++--- 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index fc74b79c..7cdcee94 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -19,12 +19,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn.utils.rnn import PackedSequence -from torch.testing import assert_allclose - from opacus.utils.per_sample_gradients_utils import ( compute_grad_samples_microbatch_and_opacus, ) +from torch.nn.utils.rnn import PackedSequence +from torch.testing import assert_allclose def expander(x, factor: int = 2): diff --git a/opacus/tests/grad_samples/conv2d_test.py b/opacus/tests/grad_samples/conv2d_test.py index 3316bd8e..0b57dd5d 100644 --- a/opacus/tests/grad_samples/conv2d_test.py +++ b/opacus/tests/grad_samples/conv2d_test.py @@ -19,11 +19,11 @@ import torch import torch.nn as nn from hypothesis import given, settings -from torch.testing import assert_allclose - from opacus.grad_sample.conv import convolution2d_backward_as_a_convolution from opacus.grad_sample.grad_sample_module import GradSampleModule from opacus.utils.tensor_utils import unfold2d +from torch.testing import assert_allclose + from .common import GradSampleHooks_test, expander, shrinker diff --git a/opacus/tests/grad_samples/dp_multihead_attention_test.py b/opacus/tests/grad_samples/dp_multihead_attention_test.py index 9ae3448c..87f60b3f 100644 --- a/opacus/tests/grad_samples/dp_multihead_attention_test.py +++ b/opacus/tests/grad_samples/dp_multihead_attention_test.py @@ -17,8 +17,8 @@ import torch import torch.nn as nn 
from hypothesis import given, settings - from opacus.layers import DPMultiheadAttention + from .common import GradSampleHooks_test diff --git a/opacus/tests/grad_samples/dp_rnn_test.py b/opacus/tests/grad_samples/dp_rnn_test.py index fcbe2d2a..38916667 100644 --- a/opacus/tests/grad_samples/dp_rnn_test.py +++ b/opacus/tests/grad_samples/dp_rnn_test.py @@ -17,11 +17,12 @@ import torch import torch.nn as nn from hypothesis import given, settings - from opacus.layers import DPGRU, DPLSTM, DPRNN from opacus.utils.packed_sequences import _gen_packed_data + from .common import GradSampleHooks_test + MODELS = [ DPRNN, DPGRU, diff --git a/opacus/tests/grad_samples/sequence_bias_test.py b/opacus/tests/grad_samples/sequence_bias_test.py index cac8dea7..9c069eac 100644 --- a/opacus/tests/grad_samples/sequence_bias_test.py +++ b/opacus/tests/grad_samples/sequence_bias_test.py @@ -16,8 +16,8 @@ import hypothesis.strategies as st import torch from hypothesis import given, settings - from opacus.layers import SequenceBias + from .common import GradSampleHooks_test diff --git a/opacus/tests/per_sample_gradients_utils_test.py b/opacus/tests/per_sample_gradients_utils_test.py index 62aacf33..6d101367 100644 --- a/opacus/tests/per_sample_gradients_utils_test.py +++ b/opacus/tests/per_sample_gradients_utils_test.py @@ -19,13 +19,12 @@ import hypothesis.strategies as st import torch from hypothesis import given, settings -from torch import nn - from opacus.tests.grad_samples.common import expander, shrinker from opacus.utils.per_sample_gradients_utils import ( - get_grad_sample_modes, check_per_sample_gradients_are_correct, + get_grad_sample_modes, ) +from torch import nn class PerSampleGradientsUtilsTest(unittest.TestCase): diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index bedfe579..3918349b 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -14,16 +14,15 @@ # limitations under the License. import io -from typing import Union, Dict, List +from typing import Dict, List, Union import numpy as np import torch import torch.nn as nn -from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence - from opacus.grad_sample import wrap_model from opacus.utils.module_utils import trainable_parameters from opacus.utils.packed_sequences import compute_seq_lengths +from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence def clone_module(module: nn.Module) -> nn.Module: From a95c95a68e13deb491f1fd19a54970469ccd020c Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 7 Nov 2022 16:42:48 +0000 Subject: [PATCH 28/32] Fix grad samples zero batch test --- opacus/tests/grad_samples/common.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index 7cdcee94..394152da 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from opacus.utils.per_sample_gradients_utils import ( compute_grad_samples_microbatch_and_opacus, + compute_opacus_grad_sample, ) from torch.nn.utils.rnn import PackedSequence from torch.testing import assert_allclose @@ -86,9 +87,15 @@ def run_test_with_reduction( rtol=10e-5, grad_sample_mode="hooks", ): - if ( - not type(x) is PackedSequence and x.numel() <= 0 - ): # We've checked opacus can handle 0-sized batch. 
Microbatch doesn't make sense + if not type(x) is PackedSequence and x.numel() <= 0: + _ = compute_opacus_grad_sample( + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + grad_sample_mode=grad_sample_mode, + ) + # We've checked opacus can handle 0-sized batch. Microbatch doesn't make sense return ( microbatch_grad_samples, From 786a0933af1bb19b1efe78796cd6779250843afa Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Mon, 7 Nov 2022 17:07:20 +0000 Subject: [PATCH 29/32] Skip functorch test when unavailable --- .../tests/per_sample_gradients_utils_test.py | 67 +++++++++++++------ 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/opacus/tests/per_sample_gradients_utils_test.py b/opacus/tests/per_sample_gradients_utils_test.py index 6d101367..5c721b1b 100644 --- a/opacus/tests/per_sample_gradients_utils_test.py +++ b/opacus/tests/per_sample_gradients_utils_test.py @@ -20,6 +20,7 @@ import torch from hypothesis import given, settings from opacus.tests.grad_samples.common import expander, shrinker +from opacus.tests.privacy_engine_test import _is_functorch_available from opacus.utils.per_sample_gradients_utils import ( check_per_sample_gradients_are_correct, get_grad_sample_modes, @@ -28,29 +29,35 @@ class PerSampleGradientsUtilsTest(unittest.TestCase): - def per_sample_grads_utils_test(self, x, model, ew_compatible, is_empty=False): + def per_sample_grads_utils_test( + self, + x, + model, + grad_sample_mode, + is_empty=False, + atol=10e-5, + rtol=10e-4, + ): if is_empty: - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - with self.assertRaises(RuntimeError): - check_per_sample_gradients_are_correct( - x, - model, - batch_first=True, - atol=10e-5, - rtol=10e-4, - grad_sample_mode=grad_sample_mode, - ) + with self.assertRaises(RuntimeError): + check_per_sample_gradients_are_correct( + x, + model, + batch_first=True, + atol=atol, + rtol=rtol, + grad_sample_mode=grad_sample_mode, + ) return - for grad_sample_mode in get_grad_sample_modes(use_ew=ew_compatible): - assert check_per_sample_gradients_are_correct( - x, - model, - batch_first=True, - atol=10e-5, - rtol=10e-4, - grad_sample_mode=grad_sample_mode, - ) + assert check_per_sample_gradients_are_correct( + x, + model, + batch_first=True, + atol=atol, + rtol=rtol, + grad_sample_mode=grad_sample_mode, + ) @given( N=st.integers(0, 4), @@ -62,6 +69,7 @@ def per_sample_grads_utils_test(self, x, model, ew_compatible, is_empty=False): padding=st.sampled_from([0, 1, 2, "same", "valid"]), dilation=st.integers(1, 2), groups=st.integers(1, 12), + grad_sample_mode=st.sampled_from(get_grad_sample_modes(use_ew=True)), ) @settings(deadline=10000) def test_conv1d( @@ -75,6 +83,7 @@ def test_conv1d( padding: int, dilation: int, groups: int, + grad_sample_mode: str, ): if padding == "same" and stride != 1: return @@ -96,7 +105,13 @@ def test_conv1d( ) ew_compatible = N > 0 - self.per_sample_grads_utils_test(x, conv, ew_compatible, N == 0) + if grad_sample_mode == "functorch" and not _is_functorch_available(): + raise unittest.SkipTest("Functorch is not available for this version.") + + if not ew_compatible and grad_sample_mode == "ew": + return + + self.per_sample_grads_utils_test(x, conv, grad_sample_mode, N == 0) @given( N=st.integers(0, 4), @@ -106,6 +121,7 @@ def test_conv1d( input_dim=st.integers(2, 4), bias=st.booleans(), batch_first=st.booleans(), + grad_sample_mode=st.sampled_from(get_grad_sample_modes(use_ew=True)), ) @settings(deadline=10000) def test_linear( @@ -117,6 +133,7 @@ def 
test_linear( input_dim: int, bias: bool, batch_first: bool, + grad_sample_mode: str, ): if input_dim == 2: @@ -135,4 +152,10 @@ def test_linear( x = x.transpose(0, 1) ew_compatible = N > 0 - self.per_sample_grads_utils_test(x, linear, ew_compatible, N == 0) + if grad_sample_mode == "functorch" and not _is_functorch_available(): + raise unittest.SkipTest("Functorch is not available for this version.") + + if not ew_compatible and grad_sample_mode == "ew": + return + + self.per_sample_grads_utils_test(x, linear, grad_sample_mode, N == 0) From 6402c18e550ff043ec5992f60afddc5a944905ce Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 8 Nov 2022 16:04:11 +0000 Subject: [PATCH 30/32] Fix merge --- opacus/tests/grad_samples/common.py | 16 +++++---- opacus/utils/per_sample_gradients_utils.py | 41 ++++++++++++++++------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index 62eba8e2..69c7220d 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -13,20 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import unittest -from typing import Dict, Iterable, List, Tuple, Union +from typing import Tuple, Union -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from opacus.grad_sample import wrap_model -from opacus.utils.module_utils import trainable_parameters -from opacus.utils.packed_sequences import compute_seq_lengths -from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence +from torch.nn.utils.rnn import PackedSequence from torch.testing import assert_close +from opacus.utils.per_sample_gradients_utils import ( + compute_grad_samples_microbatch_and_opacus, + compute_opacus_grad_sample, + is_batch_empty, +) + def expander(x, factor: int = 2): return x * factor @@ -112,6 +113,7 @@ def run_test_with_reduction( batch_first=batch_first, loss_reduction=loss_reduction, grad_sample_mode=grad_sample_mode, + chunk_method=chunk_method, ) self.check_shapes(microbatch_grad_samples, opacus_grad_samples, loss_reduction) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 3918349b..1843b895 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -14,7 +14,7 @@ # limitations under the License. import io -from typing import Dict, List, Union +from typing import Dict, List, Union, Iterable, Callable import numpy as np import torch @@ -43,6 +43,13 @@ def clone_module(module: nn.Module) -> nn.Module: return module_copy +def is_batch_empty(batch: Union[torch.Tensor, Iterable[torch.Tensor]]): + if type(batch) is torch.Tensor: + return batch.numel() == 0 + else: + return batch[0].numel() == 0 + + class ModelWithLoss(nn.Module): """ To test the gradients of a module, we need to have a loss. 
@@ -74,7 +81,10 @@ def __init__(self, module: nn.Module, loss_reduction: str = "mean"): self.criterion = nn.L1Loss(reduction=loss_reduction) def forward(self, x): - x = self.wrapped_module(x) + if type(x) is tuple: + x = self.wrapped_module(*x) + else: + x = self.wrapped_module(x) if type(x) is PackedSequence: loss = _compute_loss_packedsequences(self.criterion, x) else: @@ -88,6 +98,7 @@ def compute_microbatch_grad_sample( module: nn.Module, batch_first: bool = True, loss_reduction: str = "mean", + chunk_method: Callable = iter, ) -> Dict[str, torch.tensor]: """ Computes per-sample gradients with the microbatch method, i.e. by computing normal gradients @@ -96,11 +107,11 @@ def compute_microbatch_grad_sample( Args: x: Sample input batch - module: The nn.Module you want to test. + module: The nn.Module you want to test. batch_first: Whether batch size is the first dimension (as opposed to the second). Defaults to True. - loss_reduction: Indicates if the loss reduction (for aggregating the gradients) - is a sum or a mean operation. Can take values "sum" or "mean". + loss_reduction: What reduction to apply to the loss. Defaults to "mean". + chunk_method: The method to use to split the batch into microbatches. Defaults to ``iter``. Returns: Dictionary mapping parameter_name -> per-sample-gradient for that parameter @@ -120,12 +131,14 @@ def compute_microbatch_grad_sample( # Invariant: x is [B, T, ...] - for x_i in x: + for x_i in chunk_method(x): # x_i is [T, ...] - x_i = x_i.unsqueeze( - 0 if batch_first else 1 - ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] module.zero_grad() + if type(x_i) is not tuple: + # EmbeddingBag provides tuples + x_i = x_i.unsqueeze( + 0 if batch_first else 1 + ) # x_i of size [1, T, ...] if batch_first, else [T, 1, ...] 
loss_i = module(x_i) loss_i.backward() for p in module.parameters(): @@ -290,6 +303,7 @@ def compute_grad_samples_microbatch_and_opacus( batch_first: bool = True, loss_reduction: str = "mean", grad_sample_mode: str = "hooks", + chunk_method: Callable = iter, ): if type(x) is PackedSequence: x_unpacked = unpack_packedsequences(x) @@ -298,10 +312,15 @@ def compute_grad_samples_microbatch_and_opacus( module, batch_first=batch_first, loss_reduction=loss_reduction, + chunk_method=chunk_method, ) - elif x.numel() > 0: + elif not is_batch_empty(x): microbatch_grad_samples = compute_microbatch_grad_sample( - x, module, batch_first=batch_first, loss_reduction=loss_reduction + x, + module, + batch_first=batch_first, + loss_reduction=loss_reduction, + chunk_method=chunk_method, ) else: raise RuntimeError("x is expected to be non-empty.") From 0b9a1ec0cc301b6ed3b700abad9985115426b11a Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 8 Nov 2022 16:42:49 +0000 Subject: [PATCH 31/32] Isort fix --- opacus/tests/grad_samples/common.py | 5 ++--- opacus/utils/per_sample_gradients_utils.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/opacus/tests/grad_samples/common.py b/opacus/tests/grad_samples/common.py index 69c7220d..15a71f1e 100644 --- a/opacus/tests/grad_samples/common.py +++ b/opacus/tests/grad_samples/common.py @@ -19,14 +19,13 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn.utils.rnn import PackedSequence -from torch.testing import assert_close - from opacus.utils.per_sample_gradients_utils import ( compute_grad_samples_microbatch_and_opacus, compute_opacus_grad_sample, is_batch_empty, ) +from torch.nn.utils.rnn import PackedSequence +from torch.testing import assert_close def expander(x, factor: int = 2): diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 1843b895..8a620b5b 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -14,7 +14,7 @@ # limitations under the License. import io -from typing import Dict, List, Union, Iterable, Callable +from typing import Callable, Dict, Iterable, List, Union import numpy as np import torch From 05fdb4fe20cfc5054ed0b898543a5d52e8f0aee5 Mon Sep 17 00:00:00 2001 From: Pavel Solikov Date: Tue, 8 Nov 2022 18:01:59 +0000 Subject: [PATCH 32/32] Fix docstring --- opacus/utils/per_sample_gradients_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/opacus/utils/per_sample_gradients_utils.py b/opacus/utils/per_sample_gradients_utils.py index 8a620b5b..754d86d6 100644 --- a/opacus/utils/per_sample_gradients_utils.py +++ b/opacus/utils/per_sample_gradients_utils.py @@ -245,6 +245,7 @@ def check_per_sample_gradients_are_correct( Returns: True if per sample gradients were computed correctly. False otherwise. Example: + >>> N, Z, W = 100, 10, 10 >>> x_shape = [N, Z, W] >>> x = torch.randn(x_shape) >>> model = nn.Linear(W, W + 2)