refactoring + example

philipperemy · Apr 24, 2020 · 0ae5a8a · 0ae5a8a
1 parent 351d6bb
commit 0ae5a8a
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 20 deletions.
diff --git a/audio.py b/audio.py
@@ -14,6 +14,19 @@
 logger = logging.getLogger(__name__)
 
 
+def read_mfcc(input_filename, sample_rate):
+    """Return mfcc_fbank() of the audio cropped between its first and last above-threshold samples."""
+    audio = Audio.read(input_filename, sample_rate)
+    energy = np.abs(audio)
+    # Indices of the samples whose magnitude is strictly above the 95th percentile of |audio|.
+    # TODO: could use trim_silence() here or a better VAD.
+    offsets = np.where(energy > np.percentile(energy, 95))[0]
+    if offsets.size == 0:  # e.g. constant-magnitude audio: the offsets[0] lookup below would fail.
+        raise IndexError(f'{input_filename}: no sample is louder than the silence threshold.')
+    audio_voice_only = audio[offsets[0]:offsets[-1]]  # end is exclusive: the last loud sample is dropped.
+    return mfcc_fbank(audio_voice_only, sample_rate)
+
+
 def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
     # 'audio/dev-other/116/288045/116-288045-0000.flac'
     speaker, _, basename = Path(filename).parts[-3:]
@@ -81,15 +94,7 @@ def cache_audio_file(self, input_filename, sample_rate):
         cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
         if not os.path.isfile(cache_filename):
             try:
-                audio = Audio.read(input_filename, sample_rate)
-                energy = np.abs(audio)
-                silence_threshold = np.percentile(energy, 95)
-                offsets = np.where(energy > silence_threshold)[0]
-                # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
-                # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
-                # TODO: could use trim_silence() here or a better VAD.
-                audio_voice_only = audio[offsets[0]:offsets[-1]]
-                mfcc = mfcc_fbank(audio_voice_only, sample_rate)
+                mfcc = read_mfcc(input_filename, sample_rate)
                 np.save(cache_filename, mfcc)
             except librosa.util.exceptions.ParameterError as e:
                 logger.error(e)

diff --git a/batcher.py b/batcher.py
@@ -21,8 +21,7 @@ def extract_speaker(utt_file):
     return utt_file.split('/')[-1].split('_')[0]
 
 
-def sample_from_mfcc(utterance_file, max_length):
-    mfcc = np.load(utterance_file)
+def sample_from_mfcc(mfcc, max_length):
     if mfcc.shape[0] >= max_length:
         r = choice(range(0, len(mfcc) - max_length + 1))
         s = mfcc[r:r + max_length]
@@ -31,6 +30,11 @@ def sample_from_mfcc(utterance_file, max_length):
     return np.expand_dims(s, axis=-1)
 
 
+def sample_from_mfcc_file(utterance_file, max_length):
+    """np.load() the array stored in `utterance_file` and return sample_from_mfcc() of it."""
+    return sample_from_mfcc(np.load(utterance_file), max_length)
+
+
 class KerasFormatConverter:
 
     def __init__(self, working_dir, load_test_only=False):
@@ -84,7 +88,7 @@ def generate(self, max_length=NUM_FRAMES, counts_per_speaker=(3000, 500)):
 
     @staticmethod
     def load_into_mat(utterance_file, categorical_speakers, speaker_id, max_length, kx, ky, i):
-        kx[i] = sample_from_mfcc(utterance_file, max_length)
+        kx[i] = sample_from_mfcc_file(utterance_file, max_length)
         ky[i] = categorical_speakers.get_index(speaker_id)
 
 
@@ -155,7 +159,7 @@ def update_triplets_history(self):
         for speaker_id in selected_speakers:
             train_utterances = self.sp_to_utt_train[speaker_id]
             for selected_utterance in np.random.choice(a=train_utterances, size=self.nb_per_speaker, replace=False):
-                mfcc = sample_from_mfcc(selected_utterance, self.max_length)
+                mfcc = sample_from_mfcc_file(selected_utterance, self.max_length)
                 embeddings_utterances.append(selected_utterance)
                 model_inputs.append(mfcc)
         embeddings = self.model.m.predict(np.array(model_inputs))
@@ -208,9 +212,9 @@ def get_random_batch(self, batch_size, is_test=False):
             [extract_speaker(s) for s in pos_neg[1, :]]))
 
         batch_x = np.vstack([
-            [sample_from_mfcc(u, self.max_length) for u in anchor_utterances],
-            [sample_from_mfcc(u, self.max_length) for u in positive_utterances],
-            [sample_from_mfcc(u, self.max_length) for u in negative_utterances]
+            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
         ])
 
         batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
@@ -332,9 +336,9 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
             [extract_speaker(s) for s in anc_pos[1, :]]))
 
         batch_x = np.vstack([
-            [sample_from_mfcc(u, self.max_length) for u in anchor_utterances],
-            [sample_from_mfcc(u, self.max_length) for u in positive_utterances],
-            [sample_from_mfcc(u, self.max_length) for u in negative_utterances]
+            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
         ])
 
         batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.

diff --git a/deep-speaker b/deep-speaker
@@ -2,7 +2,8 @@
 
 set -e
 
-WORKING_DIR="/home/philippe/ds-test" # temporary for triplet training.
+HOME_DIR=~ # plain tilde expansion: same directory as before, without eval or a subshell.
+WORKING_DIR="${HOME_DIR}/.deep-speaker-wd"
 
 if [ $# -lt 1 ]; then
   echo "Usage : $0 Task [download_librispeech, build_mfcc, build_model_inputs, train_softmax, train_triplet]"

diff --git a/example.py b/example.py
@@ -0,0 +1,22 @@
+import sys
+
+import numpy as np
+
+from audio import read_mfcc
+from batcher import sample_from_mfcc
+from constants import SAMPLE_RATE, NUM_FRAMES
+from conv_models import DeepSpeakerModel
+from test import batch_cosine_similarity
+
+# Checkpoint to load: first command-line argument if given, else the original hard-coded location.
+DEFAULT_CHECKPOINT = '/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5'
+model = DeepSpeakerModel()
+model.m.load_weights(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CHECKPOINT, by_name=True)
+
+mfcc_001 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
+mfcc_002 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE), NUM_FRAMES)
+mfcc_003 = sample_from_mfcc(read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
+predict_001, predict_002, predict_003 = [model.m.predict(np.expand_dims(m, axis=0)) for m in (mfcc_001, mfcc_002, mfcc_003)]
+
+print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
+print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
diff --git a/samples/1255-90413-0001.flac b/samples/1255-90413-0001.flac