Merge branch 'feat/add_create_mn_test_doc' into 'master'

Feat/add create mn test doc See merge request speech-recognition-framework/esp-skainet!36
espressif · Sep 11, 2023 · 86b0caf · 86b0caf
2 parents 257bb34 + aa931b3
commit 86b0caf
Show file tree

Hide file tree

Showing 3 changed files with 167 additions and 12 deletions.
diff --git a/tools/record_test_set/README.md b/tools/record_test_set/README.md
@@ -1,5 +1,5 @@
-# How to record test set
-`create_test_set.py` is used to synthesize audio with different SNR, play the audio, and use the serial port to recoard audio data by the board.
+# How to record WakeNet test set
+`create_test_set.py` is used to synthesize audio with different SNR, play the audio, and use the serial port to record audio data by the board.
 
 `sdcard_recorder` app records and saves data into SD card by receiving the serial port signal from `create_test_set.py`. This project supports [ESP32-Korvo](https://github.com/espressif/esp-skainet/blob/master/docs/en/hw-reference/esp32/user-guide-esp32-korvo-v1.1.md), [ESP32-S3-Korvo-1](https://github.com/espressif/esp-skainet/blob/master/docs/en/hw-reference/esp32s3/user-guide-korvo-1.md),[ESP-S3-Korvo-2](https://docs.espressif.com/projects/esp-adf/en/latest/get-started/user-guide-esp32-s3-korvo-2.html).
 
@@ -32,7 +32,7 @@ output_set:
       clean_gain_dB: 5
 player:
   play_output: true
-   
+
 ```
 The above configuration will generate 2(the number of clean set)*3(the number of noise set)*3(the number of SNR)=18 test files.
 
@@ -51,3 +51,93 @@ pip install -r requirement.txt
 
 python create_test_set.py config.yml
 ```
+
+
+# How to record MultiNet test set
+Similar to recording the WakeNet test set, `create_mn_test_set.py` is used to synthesize audio, play the audio, and record audio data for the MultiNet test set.
+
+## 1. configuration
+Users can synthesize test sets with different SNRs by modifying the `config_mn.yml` file. The template of `config_mn.yml` is as follows:
+
+```yml
+clean_set:
+  wake_words_paths:
+    - "data/hilexin"       # the directory containing wake word recordings
+  commands_paths:
+    - "data/CN-TEST-S"     # the directory containing command recordings
+  filelists_paths:
+    - "data/cn_test.json"  # a JSON file describing the recording order
+  normalization: True
+  target_dB: -36
+noise_set:
+  paths:
+    - "./data/noise_set/silence"
+    - "./data/noise_set/pink"
+    - "./data/noise_set/pub"
+  normalization: True
+  target_dB: -36
+output_set:
+  path: "data/test"        # the directory containing the output recordings
+  remove_old_files: true
+  snr:
+    - snr_dB: 10
+      clean_gain_dB: 0
+    - snr_dB: 5
+      clean_gain_dB: 0
+    - snr_dB: 0
+      clean_gain_dB: 5
+player:
+  play_output: true
+
+```
+The above configuration will generate 3 (the number of noise sets) * 3 (the number of SNRs) = 9 test files.
+
+## 2. JSON file
+The JSON file referred to in `config_mn.yml` needs to be generated beforehand and should use the following format:
+```json
+[
+  {
+    "wake_word_fname": "K000000000000-16316211672.wav",
+    "size": 27648,
+    "wake_word_tailing_silence_ms": 100,
+    "commands": [
+      {"command_fname": "brian-3.wav", "size": 95744},
+      {"command_fname": "Omar-3_1.wav", "size": 95744},
+      {"command_fname": "Jakob-8_3.wav", "size": 95744},
+      {"command_fname": "kirill-12_1.wav", "size": 95744}],
+    "between_command_length_ms": 5000
+  },
+  {
+    "wake_word_fname": "K000000000000-1631851371-193-83532.wav",
+    "size": 26624,
+    "wake_word_tailing_silence_ms": 100,
+    "commands": [
+      {"command_fname": "Darian-21_3.wav", "size": 95744},
+      {"command_fname": "kirill-39_2.wav", "size": 95744},
+      {"command_fname": "jeroen-25.wav", "size": 95744},
+      {"command_fname": "Jakob-22_1.wav", "size": 95744}],
+    "between_command_length_ms": 5000
+  },
+  ...
+]
+```
+- `'wake_word_fname'` is the filename of a wake word recording
+- `'size'` is the number of samples of a recording
+- `'wake_word_tailing_silence_ms'` is the length of silence (ms) between the wake word and the first command.
+- `'command_fname'` is the filename of a command recording
+- `'between_command_length_ms'` is the length of silence (ms) between command recordings. **If the command recordings you are using do not have a long silent tail, make sure that there are at least 2 to 3 seconds of silence between two commands**.
+
+The following command shows an example of how to create the JSON file. You might want to modify the `wake_word_tailing_silence_ms` and `between_command_length_ms` values hard-coded in `create_mn_test_json.py` for your own test data.
+
+```sh
+python create_mn_test_json.py \
+    --wake-word-dir data/hiesp/ \
+    --command-dir data/espen \
+    --out-file en_mn_test.json
+```
+## 3. specifics
+
+- the recorded audio follows a format of: one wake word recording followed by a few command recordings.
+- there should be at least 2 to 3 seconds of silence between two consecutive commands.
+- the reason for using a standalone JSON file to store the file order is reproducibility; it also serves as a way to store the command ID sequence for evaluation.
+
diff --git a/tools/record_test_set/create_mn_test_json.py b/tools/record_test_set/create_mn_test_json.py
@@ -0,0 +1,84 @@
+#!/usr/bin/python3
+
+# This Python script is an example for creating the JSON file required by `create_mn_test_set.py`.
+# For every wake word recording it picks a random set of command recordings and
+# writes the resulting play order, together with each file's sample count, to --out-file.
+
+import argparse
+import json
+import random
+from pathlib import Path
+
+import torchaudio
+
+# Sample rate (Hz) every input recording must have.
+EXPECTED_SAMPLE_RATE = 16000
+# Upper bound on the number of commands randomly attached to one wake word.
+MAX_COMMANDS_PER_WAKE_WORD = 5
+
+
+def _num_samples(path):
+    """Return the number of samples per channel of the recording at *path*.
+
+    Raises:
+        ValueError: if the recording's sample rate is not EXPECTED_SAMPLE_RATE.
+    """
+    data, sr = torchaudio.load(path)
+    if sr != EXPECTED_SAMPLE_RATE:
+        raise ValueError(f"{path}: expected {EXPECTED_SAMPLE_RATE} Hz, got {sr} Hz")
+    # With torchaudio's default channels-first layout, dim 1 is the frame count.
+    return data.shape[1]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--wake-word-dir",
+        type=str,
+        required=True,
+        help="directory of wake word recordings"
+    )
+    parser.add_argument(
+        "--command-dir",
+        type=str,
+        required=True,
+        help="directory of command recordings"
+    )
+    parser.add_argument(
+        "--out-file",
+        type=str,
+        required=True,
+        help="output filename"
+    )
+    args = parser.parse_args()
+
+    # Sort so the candidate lists do not depend on the filesystem's listing order.
+    wakewords = sorted(Path(args.wake_word_dir).glob("*.wav"))
+    commands = sorted(Path(args.command_dir).glob("*.wav"))
+    if not commands:
+        parser.error(f"no .wav files found in {args.command_dir}")
+
+    output = []
+
+    for wakeword in wakewords:
+        block = {
+            "wake_word_fname": wakeword.name,
+            "size": _num_samples(wakeword),
+            "wake_word_tailing_silence_ms": 100,
+            "commands": [],
+            "between_command_length_ms": 5000,
+        }
+        # Never ask random.sample for more commands than are available.
+        num_command = random.randint(1, min(MAX_COMMANDS_PER_WAKE_WORD, len(commands)))
+        for command in random.sample(commands, num_command):
+            block["commands"].append(
+                {
+                    "command_fname": command.name,
+                    # Size of the command recording itself, not of the wake word.
+                    "size": _num_samples(command),
+                }
+            )
+        output.append(block)
+
+    with open(args.out_file, "wt", encoding="utf-8") as f:
+        json.dump(output, f)
diff --git a/tools/record_test_set/create_mn_test_set.py b/tools/record_test_set/create_mn_test_set.py
@@ -38,35 +38,39 @@ def merge_wake_word_command_and_noise(
         for item in filelist:
             src_audio = cls.read_audio_file(f"{wake_word_dir}/{item['wake_word_fname']}")
             if src_audio is not None:
-                rnd_sep_samples = item["wake_word_tailing_silence"]
+                rnd_sep_samples = item["wake_word_tailing_silence_ms"] / 16000
                 sep_audio = AudioSegment.silent(duration=rnd_sep_samples, frame_rate=16000)
                 clean_audio += src_audio + sep_audio
             else:
                 print(f"Failed reading {wake_word_dir}/{item['wake_word_fname']}")
                 continue
-            for cmd_item in item["commands"]:
+            between_cmd_samples = item["between_command_length_ms"] / 16000
+            between_cmd_audio = AudioSegment.silent(duration=between_cmd_samples, frame_rate=16000)
+            for i, cmd_item in enumerate(item["commands"]):
                 src_audio = cls.read_audio_file(f"{command_dir}/{cmd_item['command_fname']}")
                 if src_audio is not None:
                     clean_audio += src_audio
                 else:
                     print(f"Failed reading {command_dir}/{cmd_item['command_fname']}")
+                if i < len(item["commands"]) - 1:
+                    clean_audio += between_cmd_audio
             clean_audio += time_out_audio
 
-        # merge all clean audio file into one file
+        # merge all noise audio file into one file
         for root, _, files in os.walk(noise_dir):
             for filename in files:
                 src_file = os.path.join(root, filename)
                 src_audio = cls.read_audio_file(src_file)
                 if src_audio != None:
                     noise_audio += src_audio
-        
+
         clean_audio = clean_audio.apply_gain(clean_gain)
         noise_audio = noise_audio.apply_gain(noise_gain)
-        
+
         # the len() is not exactly, so replace len() with frame_count()
-        while(noise_audio.frame_count() < clean_audio.frame_count()):
+        while noise_audio.frame_count() < clean_audio.frame_count():
             noise_audio = noise_audio + noise_audio
-        
+
         # Returns the raw audio data as an array of (numeric) samples.
         clean_audio_array = clean_audio.get_array_of_samples()
         noise_audio_array = noise_audio.get_array_of_samples()
@@ -87,8 +91,8 @@ def merge_wake_word_command_and_noise(
 
 if __name__ == '__main__':
     description = 'Usage: \n' \
-                  'python skainet_test_set.py your_yaml_file \n' \
-                  
+                  'python create_mn_test_set.py your_yaml_file \n' \
+
     parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument('yaml')
     args = parser.parse_args()