From f9009e2ef8dcc62eba2c3432210163ef91519484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Thu, 18 Apr 2024 17:45:40 +0200 Subject: [PATCH 01/17] github: update config.yml (#1689) --- .github/ISSUE_TEMPLATE/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 6d6ee3543..84f6ea55a 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -11,5 +11,5 @@ contact_links: about: Using pyannote.audio in production? Make the most of it thanks to our consulting services. - name: Premium models - url: https://forms.gle/eKhn7H2zTa68sMMx8 + url: https://forms.office.com/e/GdqwVgkZ5C about: We are considering selling premium models, extensions, or services around pyannote.audio. From 704f87414a57173938ba546115bbd71de02da82b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Mon, 22 Apr 2024 13:04:10 +0200 Subject: [PATCH 02/17] doc: update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b3df6eabc..50dfeb286 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.gle/eKhn7H2zTa68sMMx8) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | | ---------------------- | ------ | ------ | --------- | | [AISHELL-4](https://arxiv.org/abs/2104.03603) | 14.1 | 12.2 | 11.9 | | [AliMeeting](https://www.openslr.org/119/) (channel 1) | 27.4 | 24.4 | 22.5 | From 2a7206762fd4ad095994836ce646af1abd1900bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 24 Apr 2024 21:53:36 +0200 Subject: [PATCH 03/17] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a82a2488f..49b976a1f 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. 
Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.gle/eKhn7H2zTa68sMMx8) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | | ---------------------- | ------------------------------------------------------ | ------------------------------------------------------ | ---------------------------------------------- | | AISHELL-4 | 14.1 | 12.3 | 11.9 | | AliMeeting (channel 1) | 27.4 | 24.5 | 22.5 | From 4407a66023cb42fd74450ab83b802a09ffa27d52 Mon Sep 17 00:00:00 2001 From: Lukas <38840142+lukasstorck@users.noreply.github.com> Date: Sun, 5 May 2024 21:12:39 +0200 Subject: [PATCH 04/17] doc: fix typo in powerset docstring --- pyannote/audio/utils/powerset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyannote/audio/utils/powerset.py b/pyannote/audio/utils/powerset.py index 6a0716df9..23f921569 100644 --- a/pyannote/audio/utils/powerset.py +++ b/pyannote/audio/utils/powerset.py @@ -109,7 +109,7 @@ def to_multilabel(self, powerset: torch.Tensor, soft: bool = False) -> torch.Ten Soft predictions in "powerset" space. soft : bool, optional Return soft multi-label predictions. Defaults to False (i.e. hard predictions) - Assumes that `powerset` are "logits" (not "probabilities"). + Assumes that `powerset` are "log probabilities". Returns ------- From 5d56a11fd8340f34672ed53b5e32750feee820c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 7 May 2024 17:16:14 +0200 Subject: [PATCH 05/17] chore: remove use of vmap in stats-pooling layer (#1706) --- pyannote/audio/models/blocks/pooling.py | 77 +++++++++++++------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/pyannote/audio/models/blocks/pooling.py b/pyannote/audio/models/blocks/pooling.py index 22d736a03..dc31bea8e 100644 --- a/pyannote/audio/models/blocks/pooling.py +++ b/pyannote/audio/models/blocks/pooling.py @@ -26,53 +26,53 @@ import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange -class StatsPool(nn.Module): - """Statistics pooling +def _pool(sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor: + """Helper function to compute statistics pooling - Compute temporal mean and (unbiased) standard deviation - and returns their concatenation. + Assumes that weights are already interpolated to match the number of frames + in sequences and that they encode the activation of only one speaker. - Reference - --------- - https://en.wikipedia.org/wiki/Weighted_arithmetic_mean + Parameters + ---------- + sequences : (batch, features, frames) torch.Tensor + Sequences of features. + weights : (batch, frames) torch.Tensor + (Already interpolated) weights. + Returns + ------- + output : (batch, 2 * features) torch.Tensor + Concatenation of mean and (unbiased) standard deviation. """ - def _pool(self, sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor: - """Helper function to compute statistics pooling + weights = weights.unsqueeze(dim=1) + # (batch, 1, frames) - Assumes that weights are already interpolated to match the number of frames - in sequences and that they encode the activation of only one speaker. 
+ v1 = weights.sum(dim=2) + 1e-8 + mean = torch.sum(sequences * weights, dim=2) / v1 - Parameters - ---------- - sequences : (batch, features, frames) torch.Tensor - Sequences of features. - weights : (batch, frames) torch.Tensor - (Already interpolated) weights. + dx2 = torch.square(sequences - mean.unsqueeze(2)) + v2 = torch.square(weights).sum(dim=2) - Returns - ------- - output : (batch, 2 * features) torch.Tensor - Concatenation of mean and (unbiased) standard deviation. - """ + var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8) + std = torch.sqrt(var) - weights = weights.unsqueeze(dim=1) - # (batch, 1, frames) + return torch.cat([mean, std], dim=1) - v1 = weights.sum(dim=2) + 1e-8 - mean = torch.sum(sequences * weights, dim=2) / v1 - dx2 = torch.square(sequences - mean.unsqueeze(2)) - v2 = torch.square(weights).sum(dim=2) +class StatsPool(nn.Module): + """Statistics pooling - var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8) - std = torch.sqrt(var) + Compute temporal mean and (unbiased) standard deviation + and returns their concatenation. - return torch.cat([mean, std], dim=1) + Reference + --------- + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean + + """ def forward( self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None @@ -112,17 +112,20 @@ def forward( has_speaker_dimension = True # interpolate weights if needed - _, _, num_frames = sequences.shape - _, _, num_weights = weights.shape + _, _, num_frames = sequences.size() + _, num_speakers, num_weights = weights.size() if num_frames != num_weights: warnings.warn( f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers." ) weights = F.interpolate(weights, size=num_frames, mode="nearest") - output = rearrange( - torch.vmap(self._pool, in_dims=(None, 1))(sequences, weights), - "speakers batch features -> batch speakers features", + output = torch.stack( + [ + _pool(sequences, weights[:, speaker, :]) + for speaker in range(num_speakers) + ], + dim=1, ) if not has_speaker_dimension: From 9a61ec26d9e14a8a1107f8b5bbb536160c4d6345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 10:38:18 +0200 Subject: [PATCH 06/17] fix: fix receptive field computation with non-zero padding (#1707) --- pyannote/audio/models/blocks/sincnet.py | 2 ++ pyannote/audio/models/embedding/debug.py | 32 +++++------------- .../models/embedding/wespeaker/__init__.py | 1 + .../models/embedding/wespeaker/resnet.py | 3 ++ pyannote/audio/models/embedding/xvector.py | 33 +++++-------------- .../audio/models/segmentation/SSeRiouSS.py | 10 ++++-- pyannote/audio/models/segmentation/debug.py | 32 +++++------------- pyannote/audio/utils/receptive_field.py | 11 +++++-- 8 files changed, 48 insertions(+), 76 deletions(-) diff --git a/pyannote/audio/models/blocks/sincnet.py b/pyannote/audio/models/blocks/sincnet.py index b46549bb3..2a085201c 100644 --- a/pyannote/audio/models/blocks/sincnet.py +++ b/pyannote/audio/models/blocks/sincnet.py @@ -122,12 +122,14 @@ def receptive_field_size(self, num_frames: int = 1) -> int: kernel_size = [251, 3, 5, 3, 5, 3] stride = [self.stride, 3, 1, 3, 1, 3] + padding = [0, 0, 0, 0, 0, 0] dilation = [1, 1, 1, 1, 1, 1] return multi_conv_receptive_field_size( num_frames, kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, ) diff --git a/pyannote/audio/models/embedding/debug.py b/pyannote/audio/models/embedding/debug.py index a5e862a24..b09283908 100644 --- a/pyannote/audio/models/embedding/debug.py +++ 
b/pyannote/audio/models/embedding/debug.py @@ -31,11 +31,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task -from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, -) class SimpleEmbeddingModel(Model): @@ -87,13 +82,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_num_frames( - num_samples=num_samples, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return 1 + num_samples // hop_length + else: + return 1 + (num_samples - n_fft) // hop_length def receptive_field_size(self, num_frames: int = 1) -> int: """Compute size of receptive field @@ -111,10 +103,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames, kernel_size=n_fft, stride=hop_length, dilation=1 - ) + return n_fft + (num_frames - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -134,13 +123,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=frame, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return frame * hop_length + else: + return frame * hop_length + n_fft // 2 @property def dimension(self) -> int: diff --git a/pyannote/audio/models/embedding/wespeaker/__init__.py b/pyannote/audio/models/embedding/wespeaker/__init__.py index e75779dda..75df427d6 100644 --- a/pyannote/audio/models/embedding/wespeaker/__init__.py +++ b/pyannote/audio/models/embedding/wespeaker/__init__.py @@ -154,6 +154,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=window_size, stride=step_size, + padding=0, dilation=1, ) diff --git a/pyannote/audio/models/embedding/wespeaker/resnet.py b/pyannote/audio/models/embedding/wespeaker/resnet.py index b64dd386d..2a1f58e0b 100644 --- a/pyannote/audio/models/embedding/wespeaker/resnet.py +++ b/pyannote/audio/models/embedding/wespeaker/resnet.py @@ -124,6 +124,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames, kernel_size=[3, 3], stride=[self.stride, 1], + padding=[1, 1], dilation=[1, 1], ) @@ -189,6 +190,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames, kernel_size=[1, 3, 1], stride=[1, self.stride, 1], + padding=[0, 1, 0], dilation=[1, 1, 1], ) @@ -305,6 +307,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=3, stride=1, + padding=1, dilation=1, ) diff --git a/pyannote/audio/models/embedding/xvector.py b/pyannote/audio/models/embedding/xvector.py index 00916fbd0..3161876e3 100644 --- a/pyannote/audio/models/embedding/xvector.py +++ b/pyannote/audio/models/embedding/xvector.py @@ -33,9 +33,6 @@ from pyannote.audio.models.blocks.sincnet import SincNet from pyannote.audio.utils.params import merge_dict from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, multi_conv_num_frames, 
multi_conv_receptive_field_center, multi_conv_receptive_field_size, @@ -115,13 +112,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - num_frames = conv1d_num_frames( - num_samples, - kernel_size=n_fft, - stride=hop_length, - dilation=1, - padding=n_fft // 2 if center else 0, - ) + if center: + num_frames = 1 + num_samples // hop_length + else: + num_frames = 1 + (num_samples - n_fft) // hop_length return multi_conv_num_frames( num_frames, @@ -155,13 +149,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames=receptive_field_size, - kernel_size=n_fft, - stride=hop_length, - dilation=1, - ) + return n_fft + (receptive_field_size - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -189,13 +177,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=receptive_field_center, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return receptive_field_center * hop_length + else: + return receptive_field_center * hop_length + n_fft // 2 def forward( self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None diff --git a/pyannote/audio/models/segmentation/SSeRiouSS.py b/pyannote/audio/models/segmentation/SSeRiouSS.py index ef550dfe1..b96464ab3 100644 --- a/pyannote/audio/models/segmentation/SSeRiouSS.py +++ b/pyannote/audio/models/segmentation/SSeRiouSS.py @@ -149,9 +149,12 @@ def __init__( self.lstm = nn.ModuleList( [ nn.LSTM( - wav2vec_dim - if i == 0 - else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), + ( + wav2vec_dim + if i == 0 + else lstm["hidden_size"] + * (2 if lstm["bidirectional"] else 1) + ), **one_layer_lstm, ) for i in range(num_layers) @@ -246,6 +249,7 @@ def receptive_field_size(self, num_frames: int = 1) -> int: num_frames=receptive_field_size, kernel_size=conv_layer.kernel_size, stride=conv_layer.stride, + padding=conv_layer.conv.padding[0], dilation=conv_layer.conv.dilation[0], ) return receptive_field_size diff --git a/pyannote/audio/models/segmentation/debug.py b/pyannote/audio/models/segmentation/debug.py index 93c205b3d..ccac612a9 100644 --- a/pyannote/audio/models/segmentation/debug.py +++ b/pyannote/audio/models/segmentation/debug.py @@ -31,11 +31,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task -from pyannote.audio.utils.receptive_field import ( - conv1d_num_frames, - conv1d_receptive_field_center, - conv1d_receptive_field_size, -) class SimpleSegmentationModel(Model): @@ -87,13 +82,10 @@ def num_frames(self, num_samples: int) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_num_frames( - num_samples=num_samples, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return 1 + num_samples // hop_length + else: + return 1 + (num_samples - n_fft) // hop_length def receptive_field_size(self, num_frames: int = 1) -> int: """Compute size of receptive field @@ -111,10 +103,7 @@ def receptive_field_size(self, num_frames: 
int = 1) -> int: hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft - - return conv1d_receptive_field_size( - num_frames, kernel_size=n_fft, stride=hop_length, dilation=1 - ) + return n_fft + (num_frames - 1) * hop_length def receptive_field_center(self, frame: int = 0) -> int: """Compute center of receptive field @@ -134,13 +123,10 @@ def receptive_field_center(self, frame: int = 0) -> int: n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft center = self.mfcc.MelSpectrogram.spectrogram.center - return conv1d_receptive_field_center( - frame=frame, - kernel_size=n_fft, - stride=hop_length, - padding=n_fft // 2 if center else 0, - dilation=1, - ) + if center: + return frame * hop_length + else: + return frame * hop_length + n_fft // 2 @property def dimension(self) -> int: diff --git a/pyannote/audio/utils/receptive_field.py b/pyannote/audio/utils/receptive_field.py index 0e484e4ad..420a62de0 100644 --- a/pyannote/audio/utils/receptive_field.py +++ b/pyannote/audio/utils/receptive_field.py @@ -69,7 +69,9 @@ def multi_conv_num_frames( return num_frames -def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation=1): +def conv1d_receptive_field_size( + num_frames=1, kernel_size=5, stride=1, padding=0, dilation=1 +): """Compute size of receptive field Parameters @@ -80,6 +82,8 @@ def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation= Kernel size stride : int Stride + padding : int + Padding dilation : int Dilation @@ -90,7 +94,7 @@ def conv1d_receptive_field_size(num_frames=1, kernel_size=5, stride=1, dilation= """ effective_kernel_size = 1 + (kernel_size - 1) * dilation - return effective_kernel_size + (num_frames - 1) * stride + return effective_kernel_size + (num_frames - 1) * stride - 2 * padding def multi_conv_receptive_field_size( @@ -102,11 +106,12 @@ def multi_conv_receptive_field_size( ) -> int: receptive_field_size = num_frames - for k, s, d in reversed(list(zip(kernel_size, stride, dilation))): + for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): receptive_field_size = conv1d_receptive_field_size( num_frames=receptive_field_size, kernel_size=k, stride=s, + padding=p, dilation=d, ) return receptive_field_size From 7a9013745216d736538bdfaea22c5308f9b0c23b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:16:19 +0200 Subject: [PATCH 07/17] feat: add `fbank_only` property to `WeSpeaker` models (#1708) --- CHANGELOG.md | 7 +- .../models/embedding/wespeaker/__init__.py | 158 +++++++++++++++++- .../models/embedding/wespeaker/resnet.py | 63 ++++++- 3 files changed, 210 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8741e7932..647d478ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,15 +6,16 @@ - feat(task): add option to cache task training metadata to speed up training (with [@clement-pages](https://github.com/clement-pages/)) - feat(model): add `receptive_field`, `num_frames` and `dimension` to models (with [@Bilal-Rahou](https://github.com/Bilal-Rahou)) +- feat(model): add `fbank_only` property to `WeSpeaker` models - feat(util): add `Powerset.permutation_mapping` to help with permutation in powerset space (with [@FrenchKrab](https://github.com/FrenchKrab)) -- feat(sample): add sample file at `pyannote.audio.sample.SAMPLE_FILE` +- feat(sample): add sample file at `pyannote.audio.sample.SAMPLE_FILE` - feat(metric): add `reduce` option to `diarization_error_rate` metric (with 
[@Bilal-Rahou](https://github.com/Bilal-Rahou)) - feat(pipeline): add `Waveform` and `SampleRate` preprocessors ### Fixes -- fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) -- fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) ### Improvements diff --git a/pyannote/audio/models/embedding/wespeaker/__init__.py b/pyannote/audio/models/embedding/wespeaker/__init__.py index 75df427d6..be51196c1 100644 --- a/pyannote/audio/models/embedding/wespeaker/__init__.py +++ b/pyannote/audio/models/embedding/wespeaker/__init__.py @@ -25,6 +25,7 @@ from typing import Optional import torch +import torch.nn.functional as F import torchaudio.compliance.kaldi as kaldi from pyannote.audio.core.model import Model @@ -39,16 +40,33 @@ class BaseWeSpeakerResNet(Model): + """Base class for WeSpeaker's ResNet models + + Parameters + ---------- + fbank_centering_span : float, optional + Span of the fbank centering window (in seconds). + Defaults (None) to use whole input. + + See also + -------- + torchaudio.compliance.kaldi.fbank + + """ + def __init__( self, sample_rate: int = 16000, num_channels: int = 1, num_mel_bins: int = 80, - frame_length: int = 25, - frame_shift: int = 10, + frame_length: float = 25.0, # in milliseconds + frame_shift: float = 10.0, # in milliseconds + round_to_power_of_two: bool = True, + snip_edges: bool = True, dither: float = 0.0, window_type: str = "hamming", use_energy: bool = False, + fbank_centering_span: Optional[float] = None, task: Optional[Task] = None, ): super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) @@ -60,21 +78,38 @@ def __init__( "frame_length", "frame_shift", "dither", + "round_to_power_of_two", + "snip_edges", "window_type", "use_energy", + "fbank_centering_span", ) self._fbank = partial( kaldi.fbank, num_mel_bins=self.hparams.num_mel_bins, frame_length=self.hparams.frame_length, + round_to_power_of_two=self.hparams.round_to_power_of_two, frame_shift=self.hparams.frame_shift, + snip_edges=self.hparams.snip_edges, dither=self.hparams.dither, sample_frequency=self.hparams.sample_rate, window_type=self.hparams.window_type, use_energy=self.hparams.use_energy, ) + @property + def fbank_only(self) -> bool: + """Whether to only extract fbank features""" + return getattr(self, "_fbank_only", False) + + @fbank_only.setter + def fbank_only(self, value: bool): + if hasattr(self, "receptive_field"): + del self.receptive_field + + self._fbank_only = value + def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: """Extract fbank features @@ -85,6 +120,7 @@ def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: Returns ------- fbank : (batch_size, num_frames, num_mel_bins) + fbank features Source: https://github.com/wenet-e2e/wespeaker/blob/45941e7cba2c3ea99e232d02bedf617fc71b0dad/wespeaker/bin/infer_onnx.py#L30C1-L50 """ @@ -98,11 +134,37 @@ def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor: features = torch.vmap(self._fbank)(waveforms.to(fft_device)).to(device) - return features - torch.mean(features, dim=1, keepdim=True) + # center features with global average + if self.hparams.fbank_centering_span is None: + return features - torch.mean(features, dim=1, keepdim=True) + + # center features with 
running average + window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) + step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) + kernel_size = conv1d_num_frames( + num_samples=int( + self.hparams.fbank_centering_span * self.hparams.sample_rate + ), + kernel_size=window_size, + stride=step_size, + padding=0, + dilation=1, + ) + return features - F.avg_pool1d( + features.transpose(1, 2), + kernel_size=2 * (kernel_size // 2) + 1, + stride=1, + padding=kernel_size // 2, + count_include_pad=False, + ).transpose(1, 2) @property def dimension(self) -> int: """Dimension of output""" + + if self.fbank_only: + return self.hparams.num_mel_bins + return self.resnet.embed_dim @lru_cache @@ -122,6 +184,8 @@ def num_frames(self, num_samples: int) -> int: window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) + # TODO: take round_to_power_of_two and snip_edges into account + num_frames = conv1d_num_frames( num_samples=num_samples, kernel_size=window_size, @@ -129,6 +193,10 @@ def num_frames(self, num_samples: int) -> int: padding=0, dilation=1, ) + + if self.fbank_only: + return num_frames + return self.resnet.num_frames(num_frames) def receptive_field_size(self, num_frames: int = 1) -> int: @@ -144,8 +212,13 @@ def receptive_field_size(self, num_frames: int = 1) -> int: receptive_field_size : int Receptive field size. """ + receptive_field_size = num_frames - receptive_field_size = self.resnet.receptive_field_size(receptive_field_size) + + if not self.fbank_only: + receptive_field_size = self.resnet.receptive_field_size( + receptive_field_size + ) window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) @@ -172,9 +245,11 @@ def receptive_field_center(self, frame: int = 0) -> int: Index of receptive field center. """ receptive_field_center = frame - receptive_field_center = self.resnet.receptive_field_center( - frame=receptive_field_center - ) + + if not self.fbank_only: + receptive_field_center = self.resnet.receptive_field_center( + frame=receptive_field_center + ) window_size = int(self.hparams.sample_rate * self.hparams.frame_length * 0.001) step_size = int(self.hparams.sample_rate * self.hparams.frame_shift * 0.001) @@ -189,14 +264,79 @@ def receptive_field_center(self, frame: int = 0) -> int: def forward( self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None ) -> torch.Tensor: + """Extract speaker embeddings + + Parameters + ---------- + waveforms : torch.Tensor + Batch of waveforms with shape (batch, channel, sample) + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. """ + fbank = self.compute_fbank(waveforms) + if self.fbank_only: + return fbank + + return self.resnet(fbank, weights=weights)[1] + + def forward_frames(self, waveforms: torch.Tensor) -> torch.Tensor: + """Extract frame-wise embeddings + Parameters ---------- waveforms : torch.Tensor Batch of waveforms with shape (batch, channel, sample) - weights : torch.Tensor, optional - Batch of weights with shape (batch, frame). + + Returns + ------- + embeddings : (batch, ..., embedding_frames) torch.Tensor + Batch of frame-wise embeddings. 
+ """ + fbank = self.compute_fbank(waveforms) + return self.resnet.forward_frames(fbank) + + def forward_embedding( + self, frames: torch.Tensor, weights: torch.Tensor = None + ) -> torch.Tensor: + """Extract speaker embeddings from frame-wise embeddings + + Parameters + ---------- + frames : torch.Tensor + Batch of frames with shape (batch, ..., embedding_frames). + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. + + """ + return self.resnet.forward_embedding(frames, weights=weights)[1] + + def forward( + self, waveforms: torch.Tensor, weights: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Extract speaker embeddings + + Parameters + ---------- + waveforms : torch.Tensor + Batch of waveforms with shape (batch, channel, sample) + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. """ fbank = self.compute_fbank(waveforms) diff --git a/pyannote/audio/models/embedding/wespeaker/resnet.py b/pyannote/audio/models/embedding/wespeaker/resnet.py index 2a1f58e0b..4c9d5a5f0 100644 --- a/pyannote/audio/models/embedding/wespeaker/resnet.py +++ b/pyannote/audio/models/embedding/wespeaker/resnet.py @@ -344,12 +344,64 @@ def receptive_field_center(self, frame: int = 0) -> int: return receptive_field_center - def forward(self, x: torch.Tensor, weights: Optional[torch.Tensor] = None): + def forward_frames(self, fbank: torch.Tensor) -> torch.Tensor: + """Extract frame-wise embeddings + + Parameters + ---------- + fbanks : (batch, frames, features) torch.Tensor + Batch of fbank features + + Returns + ------- + embeddings : (batch, ..., embedding_frames) torch.Tensor + Batch of frame-wise embeddings. + """ + fbank = fbank.permute(0, 2, 1) # (B,T,F) => (B,F,T) + fbank = fbank.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(fbank))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + return out + + def forward_embedding( + self, frames: torch.Tensor, weights: torch.Tensor = None + ) -> torch.Tensor: + """Extract speaker embeddings Parameters ---------- - x : (batch, frames, features) torch.Tensor + frames : torch.Tensor + Batch of frames with shape (batch, ..., embedding_frames). + weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional + Batch of weights passed to statistics pooling layer. + + Returns + ------- + embeddings : (batch, dimension) or (batch, speakers, dimension) torch.Tensor + Batch of embeddings. 
+ """ + + stats = self.pool(frames, weights=weights) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_a, embed_b + else: + return torch.tensor(0.0), embed_a + + def forward(self, fbank: torch.Tensor, weights: Optional[torch.Tensor] = None): + """Extract speaker embeddings + + Parameters + ---------- + fbank : (batch, frames, features) torch.Tensor Batch of features weights : (batch, frames) torch.Tensor, optional Batch of weights @@ -358,10 +410,9 @@ def forward(self, x: torch.Tensor, weights: Optional[torch.Tensor] = None): ------- embedding : (batch, embedding_dim) torch.Tensor """ - x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) - - x = x.unsqueeze_(1) - out = F.relu(self.bn1(self.conv1(x))) + fbank = fbank.permute(0, 2, 1) # (B,T,F) => (B,F,T) + fbank = fbank.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(fbank))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) From 461580848a621b378ad3b943b99e4b916c07c2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:21:44 +0200 Subject: [PATCH 08/17] fix(hook): fix `torch.Tensor` support in `ArtifactHook` --- CHANGELOG.md | 1 + pyannote/audio/pipelines/utils/hook.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 647d478ef..710be5817 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) +- fix(hook): fix `torch.Tensor` support in `ArtifactHook` ### Improvements diff --git a/pyannote/audio/pipelines/utils/hook.py b/pyannote/audio/pipelines/utils/hook.py index 2a675d1c9..db6972e2e 100644 --- a/pyannote/audio/pipelines/utils/hook.py +++ b/pyannote/audio/pipelines/utils/hook.py @@ -24,6 +24,7 @@ from copy import deepcopy from typing import Any, Mapping, Optional, Text +import torch from rich.progress import ( BarColumn, Progress, @@ -75,6 +76,9 @@ def __call__( ): return + if isinstance(step_artifact, torch.Tensor): + step_artifact = step_artifact.numpy(force=True) + file.setdefault(self.file_key, dict())[step_name] = deepcopy(step_artifact) From 07a85a7c061d83ac663ed8723d8f09c966b30271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:36:04 +0200 Subject: [PATCH 09/17] doc: update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 710be5817..a444fd7d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - fix(task): fix random generators and their reproducibility (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(hook): fix `torch.Tensor` support in `ArtifactHook` +- fix(doc): fix typo in `Powerset` docstring (with [@lukasstorck](https://github.com/lukasstorck)) ### Improvements @@ -30,6 +31,10 @@ - BREAKING(model): get rid of `Model.example_output` in favor of `num_frames` method, `receptive_field` property, and `dimension` property - BREAKING(task): custom tasks need to be updated (see "Add your own task" tutorial) +## Community contributions + +- community: add tutorial for offline use of `pyannote/speaker-diarization-3.1` (by [@simonottenhauskenbun](https://github.com/simonottenhauskenbun)) + ## Version 3.1.1 
(2023-12-01) ### TL;DR From e01134dbddae2e4b93c3f11f0b57757bba15a7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:40:27 +0200 Subject: [PATCH 10/17] doc: update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a444fd7d3..ad88762c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## develop +## Version 3.2.0 (2024-05-08) ### New features From bb4dd2eed453a6778246537fbb5e51190fd5ac1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 8 May 2024 11:40:45 +0200 Subject: [PATCH 11/17] git: update version --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 94ff29cc4..944880fa1 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.1.1 +3.2.0 From ba1c4f5e7cf606ab8f772fc238da66a93f5c6e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 10 May 2024 22:38:11 +0200 Subject: [PATCH 12/17] doc: update README.md to reference pyannoteAI --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e1326816a..abef6de01 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -Using `pyannote.audio` open-source toolkit in production? -Make the most of it thanks to our [consulting services](https://herve.niderb.fr/consulting.html). +Using `pyannote.audio` open-source toolkit in production? +Consider switching to [pyannoteAI](https://www.pyannote.ai) for better and faster options. # `pyannote.audio` speaker diarization toolkit @@ -79,7 +79,7 @@ for turn, _, speaker in diarization.itertracks(yield_label=True): Out of the box, `pyannote.audio` speaker diarization [pipeline](https://hf.co/pyannote/speaker-diarization-3.1) v3.1 is expected to be much better (and faster) than v2.x. 
Those numbers are diarization error rates (in %): -| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [Premium](https://forms.office.com/e/GdqwVgkZ5C) | +| Benchmark | [v2.1](https://hf.co/pyannote/speaker-diarization-2.1) | [v3.1](https://hf.co/pyannote/speaker-diarization-3.1) | [pyannoteAI](https://www.pyannote.ai) | | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------ | ------------------------------------------------ | | [AISHELL-4](https://arxiv.org/abs/2104.03603) | 14.1 | 12.2 | 11.9 | | [AliMeeting](https://www.openslr.org/119/) (channel 1) | 27.4 | 24.4 | 22.5 | From a73ded27f297c6876b722e3c7bb77428a1bac1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Sun, 12 May 2024 22:09:11 +0200 Subject: [PATCH 13/17] fix(doc): fix title level --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad88762c2..4149ccd74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,12 +26,12 @@ - improve(io): switch to `torchaudio >= 2.2.0` - improve(doc): update tutorials (with [@clement-pages](https://github.com/clement-pages/)) -## Breaking changes +### Breaking changes - BREAKING(model): get rid of `Model.example_output` in favor of `num_frames` method, `receptive_field` property, and `dimension` property - BREAKING(task): custom tasks need to be updated (see "Add your own task" tutorial) -## Community contributions +### Community contributions - community: add tutorial for offline use of `pyannote/speaker-diarization-3.1` (by [@simonottenhauskenbun](https://github.com/simonottenhauskenbun)) From cad8bea9883823ecba48e12963ef03cf4d41dc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 17 May 2024 15:39:45 +0200 Subject: [PATCH 14/17] fix(task): fix incorrect train/dev split with (some) meta-protocols (#1709) --- CHANGELOG.md | 6 ++++++ pyannote/audio/core/task.py | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4149ccd74..e48aafacb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## develop + +### Fixes + +- fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709)) + ## Version 3.2.0 (2024-05-08) ### New features diff --git a/pyannote/audio/core/task.py b/pyannote/audio/core/task.py index 0a61e2a6f..974f43a67 100644 --- a/pyannote/audio/core/task.py +++ b/pyannote/audio/core/task.py @@ -362,12 +362,13 @@ def prepare_data(self): if self.has_validation: files_iter = itertools.chain( - self.protocol.train(), self.protocol.development() + zip(itertools.repeat("train"), self.protocol.train()), + zip(itertools.repeat("development"), self.protocol.development()), ) else: - files_iter = self.protocol.train() + files_iter = zip(itertools.repeat("train"), self.protocol.train()) - for file_id, file in enumerate(files_iter): + for file_id, (subset, file) in enumerate(files_iter): # gather metadata and update metadata_unique_values so that each metadatum # (e.g. source database or label) is represented by an integer. 
metadatum = dict() @@ -378,7 +379,8 @@ def prepare_data(self): metadatum["database"] = metadata_unique_values["database"].index( file["database"] ) - metadatum["subset"] = Subsets.index(file["subset"]) + + metadatum["subset"] = Subsets.index(subset) # keep track of label scope (file, database, or global) metadatum["scope"] = Scopes.index(file["scope"]) From 5ae4c9b685feee02cbd58d25210e51def7037079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 17 May 2024 20:59:02 +0200 Subject: [PATCH 15/17] improve(io): use (faster) soundfile backend when available (#1711) --- CHANGELOG.md | 8 ++++++ pyannote/audio/core/io.py | 48 +++++++++++++++++++++++++++----- pyannote/audio/utils/protocol.py | 11 +++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e48aafacb..accc0cc1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,18 @@ ## develop +### New features + +- feat(io): add option to select torchaudio `backend` + ### Fixes - fix(task): fix wrong train/development split when training with (some) meta-protocols ([#1709](https://github.com/pyannote/pyannote-audio/issues/1709)) +### Improvements + +- improve(io): when available, default to using `soundfile` backend + ## Version 3.2.0 (2024-05-08) ### New features diff --git a/pyannote/audio/core/io.py b/pyannote/audio/core/io.py index 8fafe69d3..bce9c4dbf 100644 --- a/pyannote/audio/core/io.py +++ b/pyannote/audio/core/io.py @@ -55,14 +55,34 @@ """ -def get_torchaudio_info(file: AudioFile): +def get_torchaudio_info( + file: AudioFile, backend: str = None +) -> torchaudio.AudioMetaData: """Protocol preprocessor used to cache output of torchaudio.info This is useful to speed future random access to this file, e.g. in dataloaders using Audio.crop a lot.... + + Parameters + ---------- + file : AudioFile + backend : str + torchaudio backend to use. Defaults to 'soundfile' if available, + or the first available backend. + + Returns + ------- + info : torchaudio.AudioMetaData + Audio file metadata """ - info = torchaudio.info(file["audio"]) + if not backend: + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + + info = torchaudio.info(file["audio"], backend=backend) # rewind if needed if isinstance(file["audio"], IOBase): @@ -82,6 +102,9 @@ class Audio: In case of multi-channel audio, convert to single-channel audio using one of the following strategies: select one channel at 'random' or 'downmix' by averaging all channels. + backend : str + torchaudio backend to use. Defaults to 'soundfile' if available, + or the first available backend. 
Usage ----- @@ -179,11 +202,19 @@ def validate_file(file: AudioFile) -> Mapping: return file - def __init__(self, sample_rate=None, mono=None): + def __init__(self, sample_rate: int = None, mono=None, backend: str = None): super().__init__() self.sample_rate = sample_rate self.mono = mono + if not backend: + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + + self.backend = backend + def downmix_and_resample(self, waveform: Tensor, sample_rate: int) -> Tensor: """Downmix and resample @@ -244,7 +275,7 @@ def get_duration(self, file: AudioFile) -> float: if "torchaudio.info" in file: info = file["torchaudio.info"] else: - info = get_torchaudio_info(file) + info = get_torchaudio_info(file, backend=self.backend) frames = info.num_frames sample_rate = info.sample_rate @@ -291,7 +322,7 @@ def __call__(self, file: AudioFile) -> Tuple[Tensor, int]: sample_rate = file["sample_rate"] elif "audio" in file: - waveform, sample_rate = torchaudio.load(file["audio"]) + waveform, sample_rate = torchaudio.load(file["audio"], backend=self.backend) # rewind if needed if isinstance(file["audio"], IOBase): @@ -349,7 +380,7 @@ def crop( sample_rate = info.sample_rate else: - info = get_torchaudio_info(file) + info = get_torchaudio_info(file, backend=self.backend) frames = info.num_frames sample_rate = info.sample_rate @@ -401,7 +432,10 @@ def crop( else: try: data, _ = torchaudio.load( - file["audio"], frame_offset=start_frame, num_frames=num_frames + file["audio"], + frame_offset=start_frame, + num_frames=num_frames, + backend=self.backend, ) # rewind if needed if isinstance(file["audio"], IOBase): diff --git a/pyannote/audio/utils/protocol.py b/pyannote/audio/utils/protocol.py index 0cfe4ccf2..bca0e5942 100644 --- a/pyannote/audio/utils/protocol.py +++ b/pyannote/audio/utils/protocol.py @@ -20,7 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from functools import partial +import torchaudio from pyannote.database import FileFinder, Protocol, get_annotated from pyannote.database.protocol import SpeakerVerificationProtocol @@ -89,7 +91,14 @@ def check_protocol(protocol: Protocol) -> Protocol: if "waveform" not in file and "torchaudio.info" not in file: - protocol.preprocessors["torchaudio.info"] = get_torchaudio_info + # use soundfile when available (it usually is faster than ffmpeg for getting info) + backends = ( + torchaudio.list_audio_backends() + ) # e.g ['ffmpeg', 'soundfile', 'sox'] + backend = "soundfile" if "soundfile" in backends else backends[0] + protocol.preprocessors["torchaudio.info"] = partial( + get_torchaudio_info, backend=backend + ) msg = ( f"Protocol {protocol.name} does not precompute the output of torchaudio.info(): " f"adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. 
" From f1a6db2a2a02c0e80a3073027d1ae9b49d45b3c1 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Fri, 17 May 2024 20:03:32 +0100 Subject: [PATCH 16/17] fix(doc): remove mention of unsupported `numpy.ndarray` waveform (#1691) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé BREDIN --- CHANGELOG.md | 1 + pyannote/audio/core/io.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index accc0cc1b..064242ff5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ - fix(task): fix estimation of training set size (with [@FrenchKrab](https://github.com/FrenchKrab)) - fix(hook): fix `torch.Tensor` support in `ArtifactHook` - fix(doc): fix typo in `Powerset` docstring (with [@lukasstorck](https://github.com/lukasstorck)) +- fix(doc): remove mention of unsupported `numpy.ndarray` waveform (with [@Purfview](https://github.com/Purfview)) ### Improvements diff --git a/pyannote/audio/core/io.py b/pyannote/audio/core/io.py index bce9c4dbf..352824737 100644 --- a/pyannote/audio/core/io.py +++ b/pyannote/audio/core/io.py @@ -48,7 +48,7 @@ - a "IOBase" instance with "read" and "seek" support: open("audio.wav", "rb") - a "Mapping" with any of the above as "audio" key: {"audio": ...} - a "Mapping" with both "waveform" and "sample_rate" key: - {"waveform": (channel, time) numpy.ndarray or torch.Tensor, "sample_rate": 44100} + {"waveform": (channel, time) torch.Tensor, "sample_rate": 44100} For last two options, an additional "channel" key can be provided as a zero-indexed integer to load a specific channel: {"audio": "stereo.wav", "channel": 0} @@ -149,7 +149,7 @@ def validate_file(file: AudioFile) -> Mapping: ------- validated_file : Mapping {"audio": str, "uri": str, ...} - {"waveform": array or tensor, "sample_rate": int, "uri": str, ...} + {"waveform": tensor, "sample_rate": int, "uri": str, ...} {"audio": file, "uri": "stream"} if `file` is an IOBase instance Raises @@ -171,7 +171,7 @@ def validate_file(file: AudioFile) -> Mapping: raise ValueError(AudioFileDocString) if "waveform" in file: - waveform: Union[np.ndarray, Tensor] = file["waveform"] + waveform: Tensor = file["waveform"] if len(waveform.shape) != 2 or waveform.shape[0] > waveform.shape[1]: raise ValueError( "'waveform' must be provided as a (channel, time) torch Tensor." 
From 5e03622cb3fdc3a9d96a0cadb6dd4aad1e75ff43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Sun, 19 May 2024 16:53:40 +0200 Subject: [PATCH 17/17] improve(pipeline): do not extract embeddings in `SpeakerDiarization` pipeline when `max_speakers` is 1 (#1686) --- CHANGELOG.md | 1 + .../audio/pipelines/speaker_diarization.py | 51 +++++++++++-------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 064242ff5..02c931d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Improvements - improve(io): when available, default to using `soundfile` backend +- improve(pipeline): do not extract embeddings when `max_speakers` is set to 1 ## Version 3.2.0 (2024-05-08) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 737cd1cb2..45ae085ed 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -478,6 +478,7 @@ def apply( segmentations = self.get_segmentations(file, hook=hook) hook("segmentation", segmentations) # shape: (num_chunks, num_frames, local_num_speakers) + num_chunks, num_frames, local_num_speakers = segmentations.data.shape # binarize segmentation if self._segmentation.model.specifications.powerset: @@ -507,29 +508,39 @@ def apply( return diarization - if self.klustering == "OracleClustering" and not return_embeddings: + # skip speaker embedding extraction and clustering when only one speaker + if not return_embeddings and max_speakers < 2: + hard_clusters = np.zeros((num_chunks, local_num_speakers), dtype=np.int8) embeddings = None + centroids = None + else: - embeddings = self.get_embeddings( - file, - binarized_segmentations, - exclude_overlap=self.embedding_exclude_overlap, - hook=hook, + + # skip speaker embedding extraction with oracle clustering + if self.klustering == "OracleClustering" and not return_embeddings: + embeddings = None + + else: + embeddings = self.get_embeddings( + file, + binarized_segmentations, + exclude_overlap=self.embedding_exclude_overlap, + hook=hook, + ) + hook("embeddings", embeddings) + # shape: (num_chunks, local_num_speakers, dimension) + + hard_clusters, _, centroids = self.clustering( + embeddings=embeddings, + segmentations=binarized_segmentations, + num_clusters=num_speakers, + min_clusters=min_speakers, + max_clusters=max_speakers, + file=file, # <== for oracle clustering + frames=self._segmentation.model.receptive_field, # <== for oracle clustering ) - hook("embeddings", embeddings) - # shape: (num_chunks, local_num_speakers, dimension) - - hard_clusters, _, centroids = self.clustering( - embeddings=embeddings, - segmentations=binarized_segmentations, - num_clusters=num_speakers, - min_clusters=min_speakers, - max_clusters=max_speakers, - file=file, # <== for oracle clustering - frames=self._segmentation.model.receptive_field, # <== for oracle clustering - ) - # hard_clusters: (num_chunks, num_speakers) - # centroids: (num_speakers, dimension) + # hard_clusters: (num_chunks, num_speakers) + # centroids: (num_speakers, dimension) # number of detected clusters is the number of different speakers num_different_speakers = np.max(hard_clusters) + 1