Skip to content

Commit

Permalink
refactoring + example
Browse files Browse the repository at this point in the history
  • Loading branch information
philipperemy committed Apr 24, 2020
1 parent 351d6bb commit 0ae5a8a
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 20 deletions.
23 changes: 14 additions & 9 deletions audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@
logger = logging.getLogger(__name__)


def read_mfcc(input_filename, sample_rate):
    """Read an audio file, crop it to its loudest span and compute its MFCCs.

    The signal is loaded with ``Audio.read``. Samples whose absolute
    amplitude is strictly above the 95th percentile are treated as voice,
    and the signal is cropped to the span between the first and the last
    of them before being passed to ``mfcc_fbank``.

    If no sample exceeds the threshold (e.g. a constant signal, or audio
    clipped so heavily that the top 5% of samples share the same
    amplitude), cropping is impossible. The untrimmed signal is then used
    and a warning is logged, instead of failing with an ``IndexError`` on
    an empty index array.

    :param input_filename: path of the audio file to read.
    :param sample_rate: sample rate forwarded to ``Audio.read`` and
        ``mfcc_fbank``.
    :return: whatever ``mfcc_fbank`` returns for the cropped signal
        (presumably a frames x features array - confirm against
        ``mfcc_fbank``).
    """
    import logging  # local import: keeps this block self-contained

    audio = Audio.read(input_filename, sample_rate)
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    if len(offsets) == 0:
        # Nothing is strictly louder than the threshold: keep everything.
        logging.getLogger(__name__).warning(
            'No sample above the silence threshold in %s; using the untrimmed audio.',
            input_filename,
        )
        audio_voice_only = audio
    else:
        # TODO: could use trim_silence() here or a better VAD.
        # NOTE: the end index is exclusive, so the last sample above the
        # threshold is dropped; kept as-is so computed features do not change.
        audio_voice_only = audio[offsets[0]:offsets[-1]]
    return mfcc_fbank(audio_voice_only, sample_rate)


def extract_speaker_and_utterance_ids(filename: str): # LIBRI.
# 'audio/dev-other/116/288045/116-288045-0000.flac'
speaker, _, basename = Path(filename).parts[-3:]
Expand Down Expand Up @@ -81,15 +94,7 @@ def cache_audio_file(self, input_filename, sample_rate):
cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
if not os.path.isfile(cache_filename):
try:
audio = Audio.read(input_filename, sample_rate)
energy = np.abs(audio)
silence_threshold = np.percentile(energy, 95)
offsets = np.where(energy > silence_threshold)[0]
# left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate # frame_id to duration (ms)
# right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
# TODO: could use trim_silence() here or a better VAD.
audio_voice_only = audio[offsets[0]:offsets[-1]]
mfcc = mfcc_fbank(audio_voice_only, sample_rate)
mfcc = read_mfcc(input_filename, sample_rate)
np.save(cache_filename, mfcc)
except librosa.util.exceptions.ParameterError as e:
logger.error(e)
Expand Down
24 changes: 14 additions & 10 deletions batcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ def extract_speaker(utt_file):
return utt_file.split('/')[-1].split('_')[0]


def sample_from_mfcc(utterance_file, max_length):
mfcc = np.load(utterance_file)
def sample_from_mfcc(mfcc, max_length):
if mfcc.shape[0] >= max_length:
r = choice(range(0, len(mfcc) - max_length + 1))
s = mfcc[r:r + max_length]
Expand All @@ -31,6 +30,11 @@ def sample_from_mfcc(utterance_file, max_length):
return np.expand_dims(s, axis=-1)


def sample_from_mfcc_file(utterance_file, max_length):
    """Load the array stored in ``utterance_file`` and sample from it.

    Thin file-based wrapper: the file is read with ``np.load`` and the
    result is handed to ``sample_from_mfcc`` together with ``max_length``.

    :param utterance_file: path of a file readable by ``np.load``.
    :param max_length: forwarded unchanged to ``sample_from_mfcc``.
    :return: the value returned by ``sample_from_mfcc``.
    """
    return sample_from_mfcc(np.load(utterance_file), max_length)


class KerasFormatConverter:

def __init__(self, working_dir, load_test_only=False):
Expand Down Expand Up @@ -84,7 +88,7 @@ def generate(self, max_length=NUM_FRAMES, counts_per_speaker=(3000, 500)):

@staticmethod
def load_into_mat(utterance_file, categorical_speakers, speaker_id, max_length, kx, ky, i):
kx[i] = sample_from_mfcc(utterance_file, max_length)
kx[i] = sample_from_mfcc_file(utterance_file, max_length)
ky[i] = categorical_speakers.get_index(speaker_id)


Expand Down Expand Up @@ -155,7 +159,7 @@ def update_triplets_history(self):
for speaker_id in selected_speakers:
train_utterances = self.sp_to_utt_train[speaker_id]
for selected_utterance in np.random.choice(a=train_utterances, size=self.nb_per_speaker, replace=False):
mfcc = sample_from_mfcc(selected_utterance, self.max_length)
mfcc = sample_from_mfcc_file(selected_utterance, self.max_length)
embeddings_utterances.append(selected_utterance)
model_inputs.append(mfcc)
embeddings = self.model.m.predict(np.array(model_inputs))
Expand Down Expand Up @@ -208,9 +212,9 @@ def get_random_batch(self, batch_size, is_test=False):
[extract_speaker(s) for s in pos_neg[1, :]]))

batch_x = np.vstack([
[sample_from_mfcc(u, self.max_length) for u in anchor_utterances],
[sample_from_mfcc(u, self.max_length) for u in positive_utterances],
[sample_from_mfcc(u, self.max_length) for u in negative_utterances]
[sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
[sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
[sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
])

batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
Expand Down Expand Up @@ -332,9 +336,9 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
[extract_speaker(s) for s in anc_pos[1, :]]))

batch_x = np.vstack([
[sample_from_mfcc(u, self.max_length) for u in anchor_utterances],
[sample_from_mfcc(u, self.max_length) for u in positive_utterances],
[sample_from_mfcc(u, self.max_length) for u in negative_utterances]
[sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
[sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
[sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
])

batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something.
Expand Down
3 changes: 2 additions & 1 deletion deep-speaker
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

set -e

WORKING_DIR="/home/philippe/ds-test" # temporary for triplet training.
HOME_DIR=$(eval echo "~")
WORKING_DIR="${HOME_DIR}/.deep-speaker-wd"

if [ $# -lt 1 ]; then
echo "Usage : $0 Task [download_librispeech, build_mfcc, build_model_inputs, train_softmax, train_triplet]"
Expand Down
22 changes: 22 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

# Build the model and restore the weights from a training checkpoint.
model = DeepSpeakerModel()
model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True)


def _load_features(audio_path):
    """Read ``audio_path`` and sample NUM_FRAMES worth of features from it."""
    return sample_from_mfcc(read_mfcc(audio_path, SAMPLE_RATE), NUM_FRAMES)


def _embed(features):
    """Run the model on a single example by adding a leading batch axis."""
    return model.m.predict(np.expand_dims(features, axis=0))


# Two recordings stored under the same speaker directory.
mfcc_001 = _load_features('samples/PhilippeRemy/PhilippeRemy_001.wav')
mfcc_002 = _load_features('samples/PhilippeRemy/PhilippeRemy_002.wav')

predict_001 = _embed(mfcc_001)
predict_002 = _embed(mfcc_002)

# A recording from a different source, used for the "different speaker" comparison.
mfcc_003 = _load_features('samples/1255-90413-0001.flac')
predict_003 = _embed(mfcc_003)

# Compare the model outputs pairwise with batch_cosine_similarity.
print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
Binary file added samples/1255-90413-0001.flac
Binary file not shown.

0 comments on commit 0ae5a8a

Please sign in to comment.