From f9009e2ef8dcc62eba2c3432210163ef91519484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Thu, 18 Apr 2024 17:45:40 +0200 Subject: [PATCH 01/17] github: update config.yml (#1689) --- .github/ISSUE_TEMPLATE/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 6d6ee3543..84f6ea55a 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -11,5 +11,5 @@ contact_links: about: Using pyannote.audio in production? Make the most of it thanks to our consulting services. - name: Premium models - url: https://forms.gle/eKhn7H2zTa68sMMx8 + url: https://forms.office.com/e/GdqwVgkZ5C about: We are considering selling premium models, extensions, or services around pyannote.audio. From 704f87414a57173938ba546115bbd71de02da82b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Mon, 22 Apr 2024 13:04:10 +0200 Subject: [PATCH 02/17] doc: update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b3df6eabc..50dfeb286 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.gle/eKhn7H2zTa68sMMx8) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | | ---------------------- | ------ | ------ | --------- | | [AISHELL-4](https://arxiv.org/abs/2104.03603) | 14.1 | 12.2 | 11.9 | | [AliMeeting](https://www.openslr.org/119/) (channel 1) | 27.4 | 24.4 | 22.5 | From 2a7206762fd4ad095994836ce646af1abd1900bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 24 Apr 2024 21:53:36 +0200 Subject: [PATCH 03/17] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a82a2488f..49b976a1f 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. 
Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.gle/eKhn7H2zTa68sMMx8) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | | ---------------------- | ------------------------------------------------------ | ------------------------------------------------------ | ---------------------------------------------- | | AISHELL-4 | 14.1 | 12.3 | 11.9 | | AliMeeting (channel 1) | 27.4 | 24.5 | 22.5 | From 4407a66023cb42fd74450ab83b802a09ffa27d52 Mon Sep 17 00:00:00 2001 From: Lukas <38840142+lukasstorck@users.noreply.github.com> Date: Sun, 5 May 2024 21:12:39 +0200 Subject: [PATCH 04/17] doc: fix typo in powerset docstring --- pyannote/audio/utils/powerset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyannote/audio/utils/powerset.py b/pyannote/audio/utils/powerset.py index 6a0716df9..23f921569 100644 --- a/pyannote/audio/utils/powerset.py +++ b/pyannote/audio/utils/powerset.py @@ -109,7 +109,7 @@ def to_multilabel(self, powerset: torch.Tensor, soft: bool = False) -> torch.Ten Soft predictions in "powerset" space. soft : bool, optional Return soft multi-label predictions. Defaults to False (i.e. hard predictions) - Assumes that `powerset` are "logits" (not "probabilities"). + Assumes that `powerset` are "log probabilities". Returns ------- From 5d56a11fd8340f34672ed53b5e32750feee820c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 7 May 2024 17:16:14 +0200 Subject: [PATCH 05/17] chore: remove use of vmap in stats-pooling layer (#1706) --- pyannote/audio/models/blocks/pooling.py | 77 +++++++++++++------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/pyannote/audio/models/blocks/pooling.py b/pyannote/audio/models/blocks/pooling.py index 22d736a03..dc31bea8e 100644 --- a/pyannote/audio/models/blocks/pooling.py +++ b/pyannote/audio/models/blocks/pooling.py @@ -26,53 +26,53 @@ import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange -class StatsPool(nn.Module): - """Statistics pooling +def _pool(sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor: + """Helper function to compute statistics pooling - Compute temporal mean and (unbiased) standard deviation - and returns their concatenation. + Assumes that weights are already interpolated to match the number of frames + in sequences and that they encode the activation of only one speaker. - Reference - --------- - https://en.wikipedia.org/wiki/Weighted_arithmetic_mean + Parameters + ---------- + sequences : (batch, features, frames) torch.Tensor + Sequences of features. + weights : (batch, frames) torch.Tensor + (Already interpolated) weights. + Returns + ------- + output : (batch, 2 * features) torch.Tensor + Concatenation of mean and (unbiased) standard deviation. """ - def _pool(self, sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor: - """Helper function to compute statistics pooling + weights = weights.unsqueeze(dim=1) + # (batch, 1, frames) - Assumes that weights are already interpolated to match the number of frames - in sequences and that they encode the activation of only one speaker. 
+ v1 = weights.sum(dim=2) + 1e-8 + mean = torch.sum(sequences * weights, dim=2) / v1 - Parameters - ---------- - sequences : (batch, features, frames) torch.Tensor - Sequences of features. - weights : (batch, frames) torch.Tensor - (Already interpolated) weights. + dx2 = torch.square(sequences - mean.unsqueeze(2)) + v2 = torch.square(weights).sum(dim=2) - Returns - ------- - output : (batch, 2 * features) torch.Tensor - Concatenation of mean and (unbiased) standard deviation. - """ + var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8) + std = torch.sqrt(var) - weights = weights.unsqueeze(dim=1) - # (batch, 1, frames) + return torch.cat([mean, std], dim=1) - v1 = weights.sum(dim=2) + 1e-8 - mean = torch.sum(sequences * weights, dim=2) / v1 - dx2 = torch.square(sequences - mean.unsqueeze(2)) - v2 = torch.square(weights).sum(dim=2) +class StatsPool(nn.Module): + """Statistics pooling - var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8) - std = torch.sqrt(var) + Compute temporal mean and (unbiased) standard deviation + and returns their concatenation. - return torch.cat([mean, std], dim=1) + Reference + --------- + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean + + """ def forward( self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None @@ -112,17 +112,20 @@ def forward( has_speaker_dimension = True # interpolate weights if needed - _, _, num_frames = sequences.shape - _, _, num_weights = weights.shape + _, _, num_frames = sequences.size() + _, num_speakers, num_weights = weights.size() if num_frames != num_weights: warnings.warn( f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers." ) weights = F.interpolate(weights, size=num_frames, mode="nearest") - output = rearrange( - torch.vmap(self._pool, in_dims=(None, 1))(sequences, weights), - "speakers batch features -> batch speakers features", + output = torch.stack( + [ + _pool(sequences, weights[:, speaker, :]) + for speaker in range(num_speakers) + ], + dim=1, ) if not has_speaker_dimension: From 9a61ec26d9e14a8a1107f8b5bbb536160c4d6345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 10:38:18 +0200 Subject: [PATCH 06/17] fix: fix receptive field computation with non-zero padding (#1707) --- pyannote/audio/models/blocks/sincnet.py | 2 ++ pyannote/audio/models/embedding/debug.py | 32 +++++------------- .../models/embedding/wespeaker/__init__.py | 1 + .../models/embedding/wespeaker/resnet.py | 3 ++ pyannote/audio/models/embedding/xvector.py | 33 +++++-------------- .../audio/models/segmentation/SSeRiouSS.py | 10 ++++-- pyannote/audio/models/segmentation/debug.py | 32 +++++------------- pyannote/audio/utils/receptive_field.py | 11 +++++-- 8 files changed, 48 insertions(+), 76 deletions(-) diff --git a/pyannote/audio/models/blocks/sincnet.py b/pyannote/audio/models/blocks/sincnet.py index b46549bb3..2a085201c 100644 --- a/pyannote/audio/models/blocks/sincnet.py +++ b/pyannote/audio/models/blocks/sincnet.py @@ -122,12 +122,14 @@ def receptive_field_size(self, num_frames: int = 1) -> int: kernel_size = [251, 3, 5, 3, 5, 3] stride = [self.stride, 3, 1, 3, 1, 3] + padding = [0, 0, 0, 0, 0, 0] dilation = [1, 1, 1, 1, 1, 1] return multi_conv_receptive_field_size( num_frames, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, ) diff --git a/pyannote/audio/models/embedding/debug.py b/pyannote/audio/models/embedding/debug.py index a5e862a24..b09283908 100644 --- a/pyannote/audio/models/embedding/debug.py +++ 
b/pyannote/audio/models/embedding/debug.py @@ -31,11 +31,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task -from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, -) class SimpleEmbeddingModel(Model): @@ -87,13 +82,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_num_frames( - num_samples=num_samples, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return 1 + num_samples // hop_length + else: + return 1 + (num_samples - n_fft) // hop_length def receptive_field_size(self, num_frames: int = 1) -> int: """Compute size of receptive field @@ -111,10 +103,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames, kernel_size=n_fft, stride=hop_length, dilation=1 - ) + return n_fft + (num_frames - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -134,13 +123,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=frame, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return frame * hop_length + else: + return frame * hop_length + n_fft // 2 @property def dimension(self) -> int: diff --git a/pyannote/audio/models/embedding/wespeaker/__init__.py b/pyannote/audio/models/embedding/wespeaker/__init__.py index e75779dda..75df427d6 100644 --- a/pyannote/audio/models/embedding/wespeaker/__init__.py +++ b/pyannote/audio/models/embedding/wespeaker/__init__.py @@ -154,6 +154,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=window_size, stride=step_size, + padding=0, dilation=1, ) diff --git a/pyannote/audio/models/embedding/wespeaker/resnet.py b/pyannote/audio/models/embedding/wespeaker/resnet.py index b64dd386d..2a1f58e0b 100644 --- a/pyannote/audio/models/embedding/wespeaker/resnet.py +++ b/pyannote/audio/models/embedding/wespeaker/resnet.py @@ -124,6 +124,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames, kernel_size=[3, 3], stride=[self.stride, 1], + padding=[1, 1], dilation=[1, 1], ) @@ -189,6 +190,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames, kernel_size=[1, 3, 1], stride=[1, self.stride, 1], + padding=[0, 1, 0], dilation=[1, 1, 1], ) @@ -305,6 +307,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=3, stride=1, + padding=1, dilation=1, ) diff --git a/pyannote/audio/models/embedding/xvector.py b/pyannote/audio/models/embedding/xvector.py index 00916fbd0..3161876e3 100644 --- a/pyannote/audio/models/embedding/xvector.py +++ b/pyannote/audio/models/embedding/xvector.py @@ -33,9 +33,6 @@ from pyannote.audio.models.blocks.sincnet import SincNet from pyannote.audio.utils.params import merge_dict from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, multi_conv_num_frames, 
multi_conv_receptive_field_center, multi_conv_receptive_field_size, @@ -115,13 +112,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - num_frames = conv1d_num_frames( - num_samples, - kernel_size=n_fft, - stride=hop_length, - dilation=1, - padding=n_fft // 2 if center else 0, - ) + if center: + num_frames = 1 + num_samples // hop_length + else: + num_frames = 1 + (num_samples - n_fft) // hop_length return multi_conv_num_frames( num_frames, @@ -155,13 +149,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames=receptive_field_size, - kernel_size=n_fft, - stride=hop_length, - dilation=1, - ) + return n_fft + (receptive_field_size - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -189,13 +177,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=receptive_field_center, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return receptive_field_center * hop_length + else: + return receptive_field_center * hop_length + n_fft // 2 def forward( self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None diff --git a/pyannote/audio/models/segmentation/SSeRiouSS.py b/pyannote/audio/models/segmentation/SSeRiouSS.py index ef550dfe1..b96464ab3 100644 --- a/pyannote/audio/models/segmentation/SSeRiouSS.py +++ b/pyannote/audio/models/segmentation/SSeRiouSS.py @@ -149,9 +149,12 @@ def __init__( self.lstm = nn.ModuleList( [ nn.LSTM( - wav2vec_dim - if i == 0 - else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), + ( + wav2vec_dim + if i == 0 + else lstm["hidden_size"] + * (2 if lstm["bidirectional"] else 1) + ), **one_layer_lstm, ) for i in range(num_layers) @@ -246,6 +249,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=conv_layer.kernel_size, stride=conv_layer.stride, + padding=conv_layer.conv.padding[0], dilation=conv_layer.conv.dilation[0], ) return receptive_field_size diff --git a/pyannote/audio/models/segmentation/debug.py b/pyannote/audio/models/segmentation/debug.py index 93c205b3d..ccac612a9 100644 --- a/pyannote/audio/models/segmentation/debug.py +++ b/pyannote/audio/models/segmentation/debug.py @@ -31,11 +31,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task -from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, -) class SimpleSegmentationModel(Model): @@ -87,13 +82,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_num_frames( - num_samples=num_samples, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return 1 + num_samples // hop_length + else: + return 1 + (num_samples - n_fft) // hop_length def receptive_field_size(self, num_frames: int = 1) -> int: """Compute size of receptive field @@ -111,10 +103,7 @@ def receptive_field_size(self, num_frames: 
int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames, kernel_size=n_fft, stride=hop_length, dilation=1 - ) + return n_fft + (num_frames - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -134,13 +123,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=frame, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return frame * hop_length + else: + return frame * hop_length + n_fft // 2 @property def dimension(self) -> int: diff --git a/pyannote/audio/utils/receptive_field.py b/pyannote/audio/utils/receptive_field.py index 0e484e4ad..420a62de0 100644 --- a/pyannote/audio/utils/receptive_field.py +++ b/pyannote/audio/utils/receptive_field.py @@ -69,7 +69,9 @@ def multi_conv_num_frames( return num_frames -def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation=1): +def conv1d_receptive_field_size( + num_frames=1, kernel_size=5, stride=1, padding=0, dilation=1 +): """Compute size of receptive field Parameters @@ -80,6 +82,8 @@ def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation= Kernel size stride : int Stride + padding : int + Padding dilation : int Dilation @@ -90,7 +94,7 @@ def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation= """ effective_kernel_size = 1 + (kernel_size - 1) * dilation - return effective_kernel_size + (num_frames - 1) * stride + return effective_kernel_size + (num_frames - 1) * stride - 2 * padding def multi_conv_receptive_field_size( @@ -102,11 +106,12 @@ def multi_conv_receptive_field_size( ) -> int: receptive_field_size = num_frames - for k, s, d in reversed(list(zip(kernel_size, stride, dilation))): + for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): receptive_field_size = conv1d_receptive_field_size( num_frames=receptive_field_size, kernel_size=k, stride=s, + padding=p, dilation=d, ) return receptive_field_size From 7a9013745216d736538bdfaea22c5308f9b0c23b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:16:19 +0200 Subject: [PATCH 07/17] feat: add `fbank_only` property to `WeSpeaker` models (#1708) --- CHANGELOG.md | 7 +- .../models/embedding/wespeaker/__init__.py | 158 +++++++++++++++++- .../models/embedding/wespeaker/resnet.py | 63 ++++++- 3 files changed, 210 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8741e7932..647d478ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,15 +6,16 @@ - feat(task): add option to cache task training metadata to speed up training (with [@clement-pages](https://github.com/clement-pages/)) - feat(model): add `receptive_field`, `num_frames` and `dimension` to models (with [@Bilal-Rahou](https://github.com/Bilal-Rahou)) +- feat(model): add `fbank_only` property to `WeSpeaker` models - feat(util): add `Powerset.permutation_mapping` to help with permutation in powerset space (with [@FrenchKrab](https://github.com/FrenchKrab)) -- feat(sample): add sample file at `pyannote.audio.sample.SAMPLE_FILE` +- feat(sample): add sample file at `pyannote.audio.sample.SAMPLE_FILE` - feat(metric): add `reduce` option to `diarization_error_rate` metric (with 
[@Bilal-Rahou](https://github.com/Bilal-Rahou)) - feat(pipeline): add `Waveform` and `SampleRate` preprocessors ### Fixes -- fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) -- fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) ### Improvements diff --git a/pyannote/audio/models/embedding/wespeaker/__init__.py b/pyannote/audio/models/embedding/wespeaker/__init__.py index 75df427d6..be51196c1 100644 --- a/pyannote/audio/models/embedding/wespeaker/__init__.py +++ b/pyannote/audio/models/embedding/wespeaker/__init__.py @@ -25,6 +25,7 @@ from typing import Optional import torch +import torch.nn.functional as F import torchaudio.compliance.kaldi as kaldi from pyannote.audio.core.model import Model @@ -39,16 +40,33 @@ class BaseWeSpeakerResNet(Model): + """Base class for WeSpeaker's ResNet models + + Parameters + ---------- + fbank_centering_span : float, optional + Span of the fbank centering window (in seconds). + Defaults (None) to use whole input. + + See also + -------- + torchaudio.compliance.kaldi.fbank + + """ + def __init__( self, sample_rate: int = 16000, num_channels: int = 1, num_mel_bins: int = 80, - frame_length: int = 25, - frame_shift: int = 10, + frame_length: float = 25.0, # in milliseconds + frame_shift: float = 10.0, # in milliseconds + round_to_power_of_two: bool = True, + snip_edges: bool = True, dither: float = 0.0, window_type: str = "hamming", use_energy: bool = False, + fbank_centering_span: Optional[float] = None, task: Optional[Task] = None, ): super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) @@ -60,21 +78,38 @@ def __init__( "frame_length", "frame_shift", "dither", + "round_to_power_of_two", + "snip_edges", "window_type", "use_energy", + "fbank_centering_span", ) self._fbank = partial( kaldi.fbank, num_mel_bins=self.hparams.num_mel_bins, frame_length=self.hparams.frame_length, + round_to_power_of_two=self.hparams.round_to_power_of_two, frame_shift=self.hparams.frame_shift, + snip_edges=self.hparams.snip_edges, dither=self.hparams.dither, sample_frequency=self.hparams.sample_rate, window_type=self.hparams.window_type, use_energy=self.hparams.use_energy, ) + @property + def fbank_only(self) -> bool: + """Whether to only extract fbank features""" + return getattr(self, "_fbank_only", False) + + @fbank_only.setter + def fbank_only(self, value: bool): + if hasattr(self, "receptive_field"): + del self.receptive_field + + self._fbank_only = value + def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: """Extract fbank features @@ -85,6 +120,7 @@ def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: Returns ------- fbank : (batch_size, num_frames, num_mel_bins) + fbank features Source: https://github.com/wenet-e2e/wespeaker/blob/45941e7cba2c3ea99e232d02bedf617fc71b0dad/wespeaker/bin/infer_onnx.py#L30C1-L50 """ @@ -98,11 +134,37 @@ def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: features = torch.vmap(self._fbank)(waveforms.to(fft_device)).to(device) - return features - torch.mean(features, dim=1, keepdim=True) + # center features with global average + if self.hparams.fbank_centering_span is None: + return features - torch.mean(features, dim=1, keepdim=True) + + # center features with 
running average + window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) + step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) + kernel_size = conv1d_num_frames( + num_samples=int( + self.hparams.fbank_centering_span * self.hparams.sample_rate + ), + kernel_size=window_size, + stride=step_size, + padding=0, + dilation=1, + ) + return features - F.avg_pool1d( + features.transpose(1, 2), + kernel_size=2 * (kernel_size // 2) + 1, + stride=1, + padding=kernel_size // 2, + count_include_pad=False, + ).transpose(1, 2) @property def dimension(self) -> int: """Dimension of output""" + + if self.fbank_only: + return self.hparams.num_mel_bins + return self.resnet.embed_dim @lru_cache @@ -122,6 +184,8 @@ def num_frames(self, num_samples: int) -> int: window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) + # TODO: take round_to_power_of_two and snip_edges into account + num_frames = conv1d_num_frames( num_samples=num_samples, kernel_size=window_size, @@ -129,6 +193,10 @@ def num_frames(self, num_samples: int) -> int: padding=0, dilation=1, ) + + if self.fbank_only: + return num_frames + return self.resnet.num_frames(num_frames) def receptive_field_size(self, num_frames: int = 1) -> int: @@ -144,8 +212,13 @@ def receptive_field_size(self, num_frames: int = 1) -> int: receptive_field_size : int Receptive field size. """ + receptive_field_size = num_frames - receptive_field_size = self.resnet.receptive_field_size(receptive_field_size) + + if not self.fbank_only: + receptive_field_size = self.resnet.receptive_field_size( + receptive_field_size + ) window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) @@ -172,9 +245,11 @@ def receptive_field_center(self, frame: int = 0) -> int: Index of receptive field center. """ receptive_field_center = frame - receptive_field_center = self.resnet.receptive_field_center( - frame=receptive_field_center - ) + + if not self.fbank_only: + receptive_field_center = self.resnet.receptive_field_center( + frame=receptive_field_center + ) window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) @@ -189,14 +264,79 @@ def receptive_field_center(self, frame: int = 0) -> int: def forward( self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None ) -> torch.Tensor: + """Extract speaker embeddings + + Parameters + ---------- + waveforms : torch.Tensor + Batch of waveforms with shape (batch, channel, sample) + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. """ + fbank = self.compute_fbank(waveforms) + if self.fbank_only: + return fbank + + return self.resnet(fbank, weights=weights)[1] + + def forward_frames(self, waveforms: torch.Tensor) -> torch.Tensor: + """Extract frame-wise embeddings + Parameters ---------- waveforms : torch.Tensor Batch of waveforms with shape (batch, channel, sample) - weights : torch.Tensor, optional - Batch of weights with shape (batch, frame). + + Returns + ------- + embeddings : (batch, ..., embedding_frames) torch.Tensor + Batch of frame-wise embeddings. 
+ """ + fbank = self.compute_fbank(waveforms) + return self.resnet.forward_frames(fbank) + + def forward_embedding( + self, frames: torch.Tensor, weights: torch.Tensor = None + ) -> torch.Tensor: + """Extract speaker embeddings from frame-wise embeddings + + Parameters + ---------- + frames : torch.Tensor + Batch of frames with shape (batch, ..., embedding_frames). + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. + + """ + return self.resnet.forward_embedding(frames, weights=weights)[1] + + def forward( + self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Extract speaker embeddings + + Parameters + ---------- + waveforms : torch.Tensor + Batch of waveforms with shape (batch, channel, sample) + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. """ fbank = self.compute_fbank(waveforms) diff --git a/pyannote/audio/models/embedding/wespeaker/resnet.py b/pyannote/audio/models/embedding/wespeaker/resnet.py index 2a1f58e0b..4c9d5a5f0 100644 --- a/pyannote/audio/models/embedding/wespeaker/resnet.py +++ b/pyannote/audio/models/embedding/wespeaker/resnet.py @@ -344,12 +344,64 @@ def receptive_field_center(self, frame: int = 0) -> int: return receptive_field_center - def forward(self, x: torch.Tensor, weights: Optional[torch.Tensor] = None): + def forward_frames(self, fbank: torch.Tensor) -> torch.Tensor: + """Extract frame-wise embeddings + + Parameters + ---------- + fbanks : (batch, frames, features) torch.Tensor + Batch of fbank features + + Returns + ------- + embeddings : (batch, ..., embedding_frames) torch.Tensor + Batch of frame-wise embeddings. + """ + fbank = fbank.permute(0, 2, 1) # (B,T,F) => (B,F,T) + fbank = fbank.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(fbank))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + return out + + def forward_embedding( + self, frames: torch.Tensor, weights: torch.Tensor = None + ) -> torch.Tensor: + """Extract speaker embeddings Parameters ---------- - x : (batch, frames, features) torch.Tensor + frames : torch.Tensor + Batch of frames with shape (batch, ..., embedding_frames). + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. 
+ """ + + stats = self.pool(frames, weights=weights) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_a, embed_b + else: + return torch.tensor(0.0), embed_a + + def forward(self, fbank: torch.Tensor, weights: Optional[torch.Tensor] = None): + """Extract speaker embeddings + + Parameters + ---------- + fbank : (batch, frames, features) torch.Tensor Batch of features weights : (batch, frames) torch.Tensor, optional Batch of weights @@ -358,10 +410,9 @@ def forward(self, x: torch.Tensor, weights: Optional[torch.Tensor] = None): ------- embedding : (batch, embedding_dim) torch.Tensor """ - x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) - - x = x.unsqueeze_(1) - out = F.relu(self.bn1(self.conv1(x))) + fbank = fbank.permute(0, 2, 1) # (B,T,F) => (B,F,T) + fbank = fbank.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(fbank))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) From 461580848a621b378ad3b943b99e4b916c07c2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:21:44 +0200 Subject: [PATCH 08/17] fix(hook): fix `torch.Tensor` support in `ArtifactHook` --- CHANGELOG.md | 1 + pyannote/audio/pipelines/utils/hook.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 647d478ef..710be5817 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(hook): fix `torch.Tensor` support in `ArtifactHook` ### Improvements diff --git a/pyannote/audio/pipelines/utils/hook.py b/pyannote/audio/pipelines/utils/hook.py index 2a675d1c9..db6972e2e 100644 --- a/pyannote/audio/pipelines/utils/hook.py +++ b/pyannote/audio/pipelines/utils/hook.py @@ -24,6 +24,7 @@ from copy import deepcopy from typing import Any, Mapping, Optional, Text +import torch from rich.progress import ( BarColumn, Progress, @@ -75,6 +76,9 @@ def __call__( ): return + if isinstance(step_artifact, torch.Tensor): + step_artifact = step_artifact.numpy(force=True) + file.setdefault(self.file_key, dict())[step_name] = deepcopy(step_artifact) From 07a85a7c061d83ac663ed8723d8f09c966b30271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:36:04 +0200 Subject: [PATCH 09/17] doc: update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 710be5817..a444fd7d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(hook): fix `torch.Tensor` support in `ArtifactHook` +- fix(doc): fix typo in `Powerset` docstring (with [@lukasstorck](https://github.com/lukasstorck)) ### Improvements @@ -30,6 +31,10 @@ - BREAKING(model): get rid of `Model.example_output` in favor of `num_frames` method, `receptive_field` property, and `dimension` property - BREAKING(task): custom tasks need to be updated (see "Add your own task" tutorial) +## Community contributions + +- community: add tutorial for offline use of `pyannote/speaker-diarization-3.1` (by [@simonottenhauskenbun](https://github.com/simonottenhauskenbun)) + ## Version 3.1.1 
(2023-12-01) ### TL;DR From e01134dbddae2e4b93c3f11f0b57757bba15a7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:40:27 +0200 Subject: [PATCH 10/17] doc: update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a444fd7d3..ad88762c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## develop +## Version 3.2.0 (2024-05-08) ### New features From bb4dd2eed453a6778246537fbb5e51190fd5ac1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:40:45 +0200 Subject: [PATCH 11/17] git: update version --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 94ff29cc4..944880fa1 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.1.1 +3.2.0 From ba1c4f5e7cf606ab8f772fc238da66a93f5c6e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 10 May 2024 22:38:11 +0200 Subject: [PATCH 12/17] doc: update README.md to reference pyannoteAI --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e1326816a..abef6de01 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -Using `pyannote.audio` open-source toolkit in production? -Make the most of it thanks to our [consulting services](https://herve.niderb.fr/consulting.html). +Using `pyannote.audio` open-source toolkit in production? +Consider switching to [pyannoteAI](https://www.pyannote.ai) for better and faster options. # `pyannote.audio` speaker diarization toolkit @@ -79,7 +79,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. 
Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [pyannoteAI](https://www.pyannote.ai) | | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------ | ------------------------------------------------ | | [AISHELL-4](https://arxiv.org/abs/2104.03603) | 14.1 | 12.2 | 11.9 | | [AliMeeting](https://www.openslr.org/119/) (channel 1) | 27.4 | 24.4 | 22.5 | From a73ded27f297c6876b722e3c7bb77428a1bac1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Sun, 12 May 2024 22:09:11 +0200 Subject: [PATCH 13/17] fix(doc): fix title level --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad88762c2..4149ccd74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,12 +26,12 @@ - improve(io): switch to `torchaudio >= 2.2.0` - improve(doc): update tutorials (with [@clement-pages](https://github.com/clement-pages/)) -## Breaking changes +### Breaking changes - BREAKING(model): get rid of `Model.example_output` in favor of `num_frames` method, `receptive_field` property, and `dimension` property - BREAKING(task): custom tasks need to be updated (see "Add your own task" tutorial) -## Community contributions +### Community contributions - community: add tutorial for offline use of `pyannote/speaker-diarization-3.1` (by [@simonottenhauskenbun](https://github.com/simonottenhauskenbun)) From cad8bea9883823ecba48e12963ef03cf4d41dc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 17 May 2024 15:39:45 +0200 Subject: [PATCH 14/17] fix(task): fix incorrect train/dev split with (some) meta-protocols (#1709) --- CHANGELOG.md | 6 ++++++ pyannote/audio/core/task.py | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4149ccd74..e48aafacb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## develop + +### Fixes + +- fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709)) + ## Version 3.2.0 (2024-05-08) ### New features diff --git a/pyannote/audio/core/task.py b/pyannote/audio/core/task.py index 0a61e2a6f..974f43a67 100644 --- a/pyannote/audio/core/task.py +++ b/pyannote/audio/core/task.py @@ -362,12 +362,13 @@ def prepare_data(self): if self.has_validation: files_iter = itertools.chain( - self.protocol.train(), self.protocol.development() + zip(itertools.repeat("train"), self.protocol.train()), + zip(itertools.repeat("development"), self.protocol.development()), ) else: - files_iter = self.protocol.train() + files_iter = zip(itertools.repeat("train"), self.protocol.train()) - for file_id, file in enumerate(files_iter): + for file_id, (subset, file) in enumerate(files_iter): # gather metadata and update metadata_unique_values so that each metadatum # (e.g. source database or label) is represented by an integer. 
metadatum = dict() @@ -378,7 +379,8 @@ def prepare_data(self): metadatum["database"] = metadata_unique_values["database"].index( file["database"] ) - metadatum["subset"] = Subsets.index(file["subset"]) + + metadatum["subset"] = Subsets.index(subset) # keep track of label scope (file, database, or global) metadatum["scope"] = Scopes.index(file["scope"]) From 5ae4c9b685feee02cbd58d25210e51def7037079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 17 May 2024 20:59:02 +0200 Subject: [PATCH 15/17] improve(io): use (faster) soundfile backend when available (#1711) --- CHANGELOG.md | 8 ++++++ pyannote/audio/core/io.py | 48 +++++++++++++++++++++++++++----- pyannote/audio/utils/protocol.py | 11 +++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e48aafacb..accc0cc1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,18 @@ ## develop +### New features + +- feat(io): add option to select torchaudio `backend` + ### Fixes - fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709)) +### Improvements + +- improve(io): when available, default to using `soundfile` backend + ## Version 3.2.0 (2024-05-08) ### New features diff --git a/pyannote/audio/core/io.py b/pyannote/audio/core/io.py index 8fafe69d3..bce9c4dbf 100644 --- a/pyannote/audio/core/io.py +++ b/pyannote/audio/core/io.py @@ -55,14 +55,34 @@ """ -def get_torchaudio_info(file: AudioFile): +def get_torchaudio_info( + file: AudioFile, backend: str = None +) -> torchaudio.AudioMetaData: """Protocol preprocessor used to cache output of torchaudio.info This is useful to speed future random access to this file, e.g. in dataloaders using Audio.crop a lot.... + + Parameters + ---------- + file : AudioFile + backend : str + torchaudio backend to use. Defaults to 'soundfile' if available, + or the first available backend. + + Returns + ------- + info : torchaudio.AudioMetaData + Audio file metadata """ - info = torchaudio.info(file["audio"]) + if not backend: + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + + info = torchaudio.info(file["audio"], backend=backend) # rewind if needed if isinstance(file["audio"], IOBase): @@ -82,6 +102,9 @@ class Audio: In case of multi-channel audio, convert to single-channel audio using one of the following strategies: select one channel at 'random' or 'downmix' by averaging all channels. + backend : str + torchaudio backend to use. Defaults to 'soundfile' if available, + or the first available backend. 
Usage ----- @@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping: return file - def __init__(self, sample_rate=None, mono=None): + def __init__(self, sample_rate: int = None, mono=None, backend: str = None): super().__init__() self.sample_rate = sample_rate self.mono = mono + if not backend: + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + + self.backend = backend + def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor: """Downmix and resample @@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float: if "torchaudio.info" in file: info = file["torchaudio.info"] else: - info = get_torchaudio_info(file) + info = get_torchaudio_info(file, backend=self.backend) frames = info.num_frames sample_rate = info.sample_rate @@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]: sample_rate = file["sample_rate"] elif "audio" in file: - waveform, sample_rate = torchaudio.load(file["audio"]) + waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend) # rewind if needed if isinstance(file["audio"], IOBase): @@ -349,7 +380,7 @@ def crop( sample_rate = info.sample_rate else: - info = get_torchaudio_info(file) + info = get_torchaudio_info(file, backend=self.backend) frames = info.num_frames sample_rate = info.sample_rate @@ -401,7 +432,10 @@ def crop( else: try: data, _ = torchaudio.load( - file["audio"], frame_offset=start_frame, num_frames=num_frames + file["audio"], + frame_offset=start_frame, + num_frames=num_frames, + backend=self.backend, ) # rewind if needed if isinstance(file["audio"], IOBase): diff --git a/pyannote/audio/utils/protocol.py b/pyannote/audio/utils/protocol.py index 0cfe4ccf2..bca0e5942 100644 --- a/pyannote/audio/utils/protocol.py +++ b/pyannote/audio/utils/protocol.py @@ -20,7 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from functools import partial +import torchaudio from pyannote.database import FileFinder, Protocol, get_annotated from pyannote.database.protocol import SpeakerVerificationProtocol @@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol: if "waveform" not in file and "torchaudio.info" not in file: - protocol.preprocessors["torchaudio.info"] = get_torchaudio_info + # use soundfile when available (it usually is faster than ffmpeg for getting info) + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + protocol.preprocessors["torchaudio.info"] = partial( + get_torchaudio_info, backend=backend + ) msg = ( f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): " f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. 
" From f1a6db2a2a02c0e80a3073027d1ae9b49d45b3c1 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Fri, 17 May 2024 20:03:32 +0100 Subject: [PATCH 16/17] fix(doc): remove mention of unsupported `numpy.ndarray` waveform (#1691) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé BREDIN --- CHANGELOG.md | 1 + pyannote/audio/core/io.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index accc0cc1b..064242ff5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(hook): fix `torch.Tensor` support in `ArtifactHook` - fix(doc): fix typo in `Powerset` docstring (with [@lukasstorck](https://github.com/lukasstorck)) +- fix(doc): remove mention of unsupported `numpy.ndarray` waveform (with [@Purfview](https://github.com/Purfview)) ### Improvements diff --git a/pyannote/audio/core/io.py b/pyannote/audio/core/io.py index bce9c4dbf..352824737 100644 --- a/pyannote/audio/core/io.py +++ b/pyannote/audio/core/io.py @@ -48,7 +48,7 @@ - a "IOBase" instance with "read" and "seek" support: open("audio.wav", "rb") - a "Mapping" with any of the above as "audio" key: {"audio": ...} - a "Mapping" with both "waveform" and "sample_rate" key: - {"waveform": (channel, time) numpy.ndarray or torch.Tensor, "sample_rate": 44100} + {"waveform": (channel, time) torch.Tensor, "sample_rate": 44100} For last two options, an additional "channel" key can be provided as a zero-indexed integer to load a specific channel: {"audio": "stereo.wav", "channel": 0} @@ -149,7 +149,7 @@ def validate_file(file: AudioFile) -> Mapping: ------- validated_file : Mapping {"audio": str, "uri": str, ...} - {"waveform": array or tensor, "sample_rate": int, "uri": str, ...} + {"waveform": tensor, "sample_rate": int, "uri": str, ...} {"audio": file, "uri": "stream"} if `file` is an IOBase instance Raises @@ -171,7 +171,7 @@ def validate_file(file: AudioFile) -> Mapping: raise ValueError(AudioFileDocString) if "waveform" in file: - waveform: Union[np.ndarray, Tensor] = file["waveform"] + waveform: Tensor = file["waveform"] if len(waveform.shape) != 2 or waveform.shape[0] > waveform.shape[1]: raise ValueError( "'waveform' must be provided as a (channel, time) torch Tensor." 
From 5e03622cb3fdc3a9d96a0cadb6dd4aad1e75ff43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Sun, 19 May 2024 16:53:40 +0200 Subject: [PATCH 17/17] improve(pipeline): do not extract embeddings in `SpeakerDiarization` pipeline when `max_speakers` is 1 (#1686) --- CHANGELOG.md | 1 + .../audio/pipelines/speaker_diarization.py | 51 +++++++++++-------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 064242ff5..02c931d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Improvements - improve(io): when available, default to using `soundfile` backend +- improve(pipeline): do not extract embeddings when `max_speakers` is set to 1 ## Version 3.2.0 (2024-05-08) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 737cd1cb2..45ae085ed 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -478,6 +478,7 @@ def apply( segmentations = self.get_segmentations(file, hook=hook) hook("segmentation", segmentations) # shape: (num_chunks, num_frames, local_num_speakers) + num_chunks, num_frames, local_num_speakers = segmentations.data.shape # binarize segmentation if self._segmentation.model.specifications.powerset: @@ -507,29 +508,39 @@ def apply( return diarization - if self.klustering == "OracleClustering" and not return_embeddings: + # skip speaker embedding extraction and clustering when only one speaker + if not return_embeddings and max_speakers < 2: + hard_clusters = np.zeros((num_chunks, local_num_speakers), dtype=np.int8) embeddings = None + centroids = None + else: - embeddings = self.get_embeddings( - file, - binarized_segmentations, - exclude_overlap=self.embedding_exclude_overlap, - hook=hook, + + # skip speaker embedding extraction with oracle clustering + if self.klustering == "OracleClustering" and not return_embeddings: + embeddings = None + + else: + embeddings = self.get_embeddings( + file, + binarized_segmentations, + exclude_overlap=self.embedding_exclude_overlap, + hook=hook, + ) + hook("embeddings", embeddings) + # shape: (num_chunks, local_num_speakers, dimension) + + hard_clusters, _, centroids = self.clustering( + embeddings=embeddings, + segmentations=binarized_segmentations, + num_clusters=num_speakers, + min_clusters=min_speakers, + max_clusters=max_speakers, + file=file, # <== for oracle clustering + frames=self._segmentation.model.receptive_field, # <== for oracle clustering ) - hook("embeddings", embeddings) - # shape: (num_chunks, local_num_speakers, dimension) - - hard_clusters, _, centroids = self.clustering( - embeddings=embeddings, - segmentations=binarized_segmentations, - num_clusters=num_speakers, - min_clusters=min_speakers, - max_clusters=max_speakers, - file=file, # <== for oracle clustering - frames=self._segmentation.model.receptive_field, # <== for oracle clustering - ) - # hard_clusters: (num_chunks, num_speakers) - # centroids: (num_speakers, dimension) + # hard_clusters: (num_chunks, num_speakers) + # centroids: (num_speakers, dimension) # number of detected clusters is the number of different speakers num_different_speakers = np.max(hard_clusters) + 1