diff --git a/tools/record_test_set/README.md b/tools/record_test_set/README.md index 0438482..d040a00 100644 --- a/tools/record_test_set/README.md +++ b/tools/record_test_set/README.md @@ -1,5 +1,5 @@ -# How to record test set -`create_test_set.py` is used to synthesize audio with different SNR, play the audio, and use the serial port to recoard audio data by the board. +# How to record WakeNet test set +`create_test_set.py` is used to synthesize audio with different SNR, play the audio, and use the serial port to record audio data by the board. `sdcard_recorder` app records and saves data into SD card by receiving the serial port signal from `create_test_set.py`. This project supports [ESP32-Korvo](https://github.com/espressif/esp-skainet/blob/master/docs/en/hw-reference/esp32/user-guide-esp32-korvo-v1.1.md), [ESP32-S3-Korvo-1](https://github.com/espressif/esp-skainet/blob/master/docs/en/hw-reference/esp32s3/user-guide-korvo-1.md),[ESP-S3-Korvo-2](https://docs.espressif.com/projects/esp-adf/en/latest/get-started/user-guide-esp32-s3-korvo-2.html). @@ -32,7 +32,7 @@ output_set: clean_gain_dB: 5 player: play_output: true - + ``` The above configuration will generate 2(the number of clean set)*3(the number of noise set)*3(the number of SNR)=18 test files. @@ -51,3 +51,93 @@ pip install -r requirement.txt python create_test_set.py config.yml ``` + + +# How to record MultiNet test set +Similar to recording WakeNet test set, `create_mn_test_set.py` is used to synthesize audio, play the audio, and record audio data for MultiNet test set. + +## 1. configuration +The user can synthesize test set with different SNR by modifying the `config_mn.yml` file. 
The template of `config_mn.yml` is as follows: + +```yml +clean_set: + wake_words_paths: + - "data/hilexin" # the directory containing wake word recordings + commands_paths: + - "data/CN-TEST-S" # the directory containing command recordings + filelists_paths: + - "data/cn_test.json" # a JSON file containing the recording order + normalization: True + target_dB: -36 +noise_set: + paths: + - "./data/noise_set/silence" + - "./data/noise_set/pink" + - "./data/noise_set/pub" + normalization: True + target_dB: -36 +output_set: + path: "data/test" # the directory that will contain the output recordings + remove_old_files: true + snr: + - snr_dB: 10 + clean_gain_dB: 0 + - snr_dB: 5 + clean_gain_dB: 0 + - snr_dB: 0 + clean_gain_dB: 5 +player: + play_output: true + +``` +The above configuration will generate 3 (the number of noise sets) * 3 (the number of SNRs) = 9 test files. + +## 2. JSON file +The JSON file referred to in `config_mn.yml` needs to be generated beforehand and should use the following format: +```json +[ + { + "wake_word_fname": "K000000000000-16316211672.wav", + "size": 27648, + "wake_word_tailing_silence_ms": 100, + "commands": [ + {"command_fname": "brian-3.wav", "size": 95744}, + {"command_fname": "Omar-3_1.wav", "size": 95744}, + {"command_fname": "Jakob-8_3.wav", "size": 95744}, + {"command_fname": "kirill-12_1.wav", "size": 95744}], + "between_command_length_ms": 5000 + }, + { + "wake_word_fname": "K000000000000-1631851371-193-83532.wav", + "size": 26624, + "wake_word_tailing_silence_ms": 100, + "commands": [ + {"command_fname": "Darian-21_3.wav", "size": 95744}, + {"command_fname": "kirill-39_2.wav", "size": 95744}, + {"command_fname": "jeroen-25.wav", "size": 95744}, + {"command_fname": "Jakob-22_1.wav", "size": 95744}], + "between_command_length_ms": 5000 + }, + ... 
+] +``` +- `'wake_word_fname'` is the filename of a wake word recording +- `'size'` is the number of samples of a recording +- `'wake_word_tailing_silence_ms'` is the length of silence (ms) between the wake word and the first command. +- `'command_fname'` is the filename of a command recording +- `'between_command_length_ms'` is the length of silence (ms) between command recordings, **if the command recording you are using does not have a long silent tail, make sure that there are at least 2 to 3 seconds of silence between two commands**. + +The following command provides an example of how to create the JSON file; you might want to modify the `wake_word_tailing_silence_ms` and `between_command_length_ms` in it for your own test data. + +```sh +python create_mn_test_json.py \ + --wake-word-dir data/hiesp/ \ + --command-dir data/espen \ + --out-file en_mn_test.json +``` +## 3. specifics + +- the recorded audio follows a format of: one wake word recording followed by a few command recordings. +- there should be at least 2 to 3 seconds of silence between two consecutive commands. +- the reason for using a standalone JSON file to store the file order is for reproducibility and as a way to store the command ID sequence for evaluation. 
#!/usr/bin/python3

# This Python script is an example for creating the JSON file required by `create_mn_test_set.py`

import argparse
import json
import random
from pathlib import Path

import torchaudio

# Sample rate (Hz) that every input recording is required to have.
EXPECTED_SAMPLE_RATE = 16000
# Upper bound on how many commands are placed after a single wake word.
MAX_COMMANDS_PER_WAKE_WORD = 5


def _num_samples(wav_path):
    """Return the number of samples per channel of the recording at *wav_path*.

    Raises:
        ValueError: if the recording is not sampled at 16 kHz.
    """
    data, sr = torchaudio.load(wav_path)
    if sr != EXPECTED_SAMPLE_RATE:
        raise ValueError(
            f"{wav_path}: expected {EXPECTED_SAMPLE_RATE} Hz, got {sr} Hz"
        )
    # Dimension 1 of the loaded tensor is used as the sample count
    # (torchaudio's default layout is channels-first).
    return data.shape[1]


def main():
    """Pair every wake word with 1-5 random commands and dump the list as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wake-word-dir",
        type=str,
        required=True,
        help="directory of wake word recordings"
    )
    parser.add_argument(
        "--command-dir",
        type=str,
        required=True,
        help="directory of command recordings"
    )
    parser.add_argument(
        "--out-file",
        type=str,
        required=True,
        help="output filename"
    )
    args = parser.parse_args()

    wakewords = list(Path(args.wake_word_dir).glob("*.wav"))
    commands = list(Path(args.command_dir).glob("*.wav"))

    if wakewords and not commands:
        parser.error(f"no .wav files found in {args.command_dir}")

    # random.sample() cannot draw more items than the population holds, so
    # cap the draw when fewer than five command recordings are available.
    max_commands = min(MAX_COMMANDS_PER_WAKE_WORD, len(commands))

    output = []

    for wakeword in wakewords:
        block = {
            "wake_word_fname": wakeword.name,
            "size": _num_samples(wakeword),
            "wake_word_tailing_silence_ms": 100,
            "commands": [],
            "between_command_length_ms": 5000,
        }
        num_command = random.randint(1, max_commands)
        for command in random.sample(commands, num_command):
            block["commands"].append(
                {
                    "command_fname": command.name,
                    # Measure the command recording itself; the original
                    # loaded the wake word here and stored its size instead.
                    "size": _num_samples(command),
                }
            )
        output.append(block)

    with open(args.out_file, "wt") as f:
        json.dump(output, f)


if __name__ == '__main__':
    main()
cls.read_audio_file(f"{wake_word_dir}/{item['wake_word_fname']}") if src_audio is not None: - rnd_sep_samples = item["wake_word_tailing_silence"] + rnd_sep_samples = item["wake_word_tailing_silence_ms"] / 16000 sep_audio = AudioSegment.silent(duration=rnd_sep_samples, frame_rate=16000) clean_audio += src_audio + sep_audio else: print(f"Failed reading {wake_word_dir}/{item['wake_word_fname']}") continue - for cmd_item in item["commands"]: + between_cmd_samples = item["between_command_length_ms"] / 16000 + between_cmd_audio = AudioSegment.silent(duration=between_cmd_samples, frame_rate=16000) + for i, cmd_item in enumerate(item["commands"]): src_audio = cls.read_audio_file(f"{command_dir}/{cmd_item['command_fname']}") if src_audio is not None: clean_audio += src_audio else: print(f"Failed reading {command_dir}/{cmd_item['command_fname']}") + if i < len(item["commands"]) - 1: + clean_audio += between_cmd_audio clean_audio += time_out_audio - # merge all clean audio file into one file + # merge all noise audio file into one file for root, _, files in os.walk(noise_dir): for filename in files: src_file = os.path.join(root, filename) src_audio = cls.read_audio_file(src_file) if src_audio != None: noise_audio += src_audio - + clean_audio = clean_audio.apply_gain(clean_gain) noise_audio = noise_audio.apply_gain(noise_gain) - + # the len() is not exactly, so replace len() with frame_count() - while(noise_audio.frame_count() < clean_audio.frame_count()): + while noise_audio.frame_count() < clean_audio.frame_count(): noise_audio = noise_audio + noise_audio - + # Returns the raw audio data as an array of (numeric) samples. 
clean_audio_array = clean_audio.get_array_of_samples() noise_audio_array = noise_audio.get_array_of_samples() @@ -87,8 +91,8 @@ def merge_wake_word_command_and_noise( if __name__ == '__main__': description = 'Usage: \n' \ - 'python skainet_test_set.py your_yaml_file \n' \ - + 'python create_mn_test_set.py your_yaml_file \n' \ + parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('yaml') args = parser.parse_args()