From 962ad508d3134ea5de3da1ea7b2ee5355d00c43f Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 11:19:07 +0100
Subject: [PATCH 01/77] data prep scripts update

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 148 +++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 recipes/GigaSpeech/gigaspeech_prepare.py

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
new file mode 100644
index 0000000000..25784a4819
--- /dev/null
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -0,0 +1,148 @@
+"""
+Data preparation script for the GigaSpeech dataset.
+
+Download instruction: https://github.com/SpeechColab/GigaSpeech
+Reference: https://arxiv.org/abs/2106.06909
+
+Author
+-------
+ * Adel Moumen, 2024
+"""
+
+import logging
+import os
+import json
+
+logger = logging.getLogger(__name__)
+SAMPLERATE = 16000
+GRABAGE_UTTERANCE_TAGS = ["<SIL>", "<MUSIC>", "<NOISE>", "<OTHER>"]
+PUNCTUATION_TAGS = {
+    "<COMMA>": ",",
+    "<EXCLAMATIONPOINT>": "!",
+    "<PERIOD>": ".",
+    "<QUESTIONMARK>": "?"
+}
+SPLITS = ["train", "dev", "test"]
+TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"]
+
+def prepare_gigaspeech(
+    data_folder,
+    save_folder,
+    splits: list = SPLITS,
+    json_file="GigaSpeech.json",
+    skip_prep: bool = False,
+):
+    """TODO.
+    """
+    if skip_prep:
+        logger.info("Skipping data preparation as `skip_prep` is set to `True`")
+        return
+
+    os.makedirs(save_folder, exist_ok=True)
+
+    if skip():  # TODO: Implement skip function
+        logger.info("Skipping preparation, completed in previous run.")
+        return
+    else:
+        logger.info("Starting data preparation...")
+
+    check_gigaspeech_folders(data_folder, json_file)
+
+    json_metadata = os.path.join(data_folder, json_file)
+    logger.info("Creating train, dev, and test subsets.")
+
+    with open(json_metadata, "r") as f:
+        info = json.load(f)
+
+    ret = []
+    for split in splits:
+        for audio in info["audios"]:
+            # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets.
+            # such as "{XL}" and "{L}".
+            if ("{" + split + "}") in audio["subsets"]:
+                wav_path = os.path.join(data_folder, audio["path"])
+                assert wav_path.is_file(), f"File not found: {wav_path}"
+
+def preprocess_text(text: str) -> str:
+    """
+    Preprocesses the input text by removing garbage tags and replacing punctuation tags.
+
+    Parameters
+    ----------
+    text : str
+        The input text to be preprocessed.
+
+    Returns
+    -------
+    str
+        The preprocessed text with removed garbage tags and replaced punctuation tags.
+
+    Raises
+    ------
+    AssertionError
+        If '<' or '>' tags are found in the text after preprocessing.
+
+    Notes
+    -----
+    The function iterates over predefined garbage utterance tags (GRABAGE_UTTERANCE_TAGS)
+    and removes them from the input text. It then iterates over predefined punctuation tags
+    (PUNCTUATION_TAGS) and replaces them with the corresponding punctuation.
+
+    Examples
+    --------
+    >>> preprocess_text("Hello world <SIL> <EXCLAMATIONPOINT> ")
+    'hello world !'
+    """
+    # Remove garbage tags
+    for tag in GRABAGE_UTTERANCE_TAGS:
+        text = text.replace(tag, "")
+
+    # Remove punctuation tags
+    for tag, punctuation in PUNCTUATION_TAGS.items():
+        text = text.replace(' ' + tag + ' ', punctuation)
+
+    assert "<" not in text and ">" not in text, f"Found tags in the text: {text}"
+    return text.lower()
+
+
+def skip():
+    """TODO.
+    """
+    return False
+
+def check_gigaspeech_folders(data_folder, json_file="GigaSpeech.json", audio_folder="audio"):
+    """Check if the data folder actually contains the GigaSpeech dataset.
+
+    If it does not, an error is raised.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    OSError
+        If GigaSpeech is not found at the specified path.
+    """
+    # Checking if "GigaSpeech.json" exist
+    json_gigaspeech = os.path.join(data_folder, json_file)
+    check_file(json_gigaspeech)
+
+    # Check if audio folders exist
+    for folder_subset in ["audiobook", "podcast", "youtube"]:
+        audio_subset = os.path.join(data_folder, audio_folder, folder_subset)
+        if not os.path.exists(audio_subset):
+            err_msg = (
+                "the file %s does not exist (it is expected in the "
+                "Gigaspeech dataset)" % audio_subset
+            )
+            raise OSError(err_msg)
+
+def check_file(path):
+    # Check if file exist
+    if not os.path.exists(path):
+        err_msg = (
+            "the opus file %s does not exist (it is expected in the "
+            "Gigaspeech dataset)" % path
+        )
+        raise OSError(err_msg)
\ No newline at end of file
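A note on the replacement logic introduced above: `replace(' ' + tag + ' ', punctuation)` consumes the spaces on both sides of a punctuation tag, so the word that follows ends up glued to the punctuation mark. A minimal sketch of that behaviour (invented input string; tag values as defined in the script):

    tags = {"<COMMA>": ",", "<PERIOD>": "."}
    text = "HELLO WORLD <COMMA> HOW ARE YOU <PERIOD> "
    for tag, punct in tags.items():
        text = text.replace(" " + tag + " ", punct)
    print(text.lower())  # -> 'hello world,how are you.' (no space left after the comma)

The next commit switches to replacing `' ' + tag` only, which keeps the space that follows the tag intact.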
From 39b5049af23ded1b39ab28fb13e73dac1cd300a7 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 12:24:45 +0100
Subject: [PATCH 02/77] iterate over utterances

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 75 ++++++++++++++++++++----
 1 file changed, 64 insertions(+), 11 deletions(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index 25784a4819..1c63bafbb5 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -1,7 +1,7 @@
 """
 Data preparation script for the GigaSpeech dataset.

-Download instruction: https://github.com/SpeechColab/GigaSpeech
+Download instructions: https://github.com/SpeechColab/GigaSpeech
 Reference: https://arxiv.org/abs/2106.06909

 Author
@@ -12,6 +12,8 @@
 import logging
 import os
 import json
+from dataclasses import dataclass
+from speechbrain.utils.parallel import parallel_map

 logger = logging.getLogger(__name__)
 SAMPLERATE = 16000
 GRABAGE_UTTERANCE_TAGS = ["<SIL>", "<MUSIC>", "<NOISE>", "<OTHER>"]
 PUNCTUATION_TAGS = {
     "<COMMA>": ",",
     "<EXCLAMATIONPOINT>": "!",
     "<PERIOD>": ".",
     "<QUESTIONMARK>": "?"
 }
-SPLITS = ["train", "dev", "test"]
+SPLITS = ["DEV", "TEST"]
 TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"]
+
+
+@dataclass
+class GigaSpeechRow:
+    utt_id: str  # segment[sid]
+    wav_id: str  # audio[aid]
+    speaker: str  # audio["speaker"]
+    begin_time: float
+    end_time: float
+    duration: float
+    text: str
+
+
 def prepare_gigaspeech(
     data_folder,
     save_folder,
     splits: list = SPLITS,
+    train_subset: list = TRAIN_SUBSET,
     json_file="GigaSpeech.json",
     skip_prep: bool = False,
 ):
     """TODO.
     """
@@ -54,14 +70,41 @@
     with open(json_metadata, "r") as f:
         info = json.load(f)

-    ret = []
-    for split in splits:
+    ret = {}
+    import time
+    time1 = time.time()
+    for split in splits + train_subset:
+        ret[split] = []
         for audio in info["audios"]:
-            # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets.
+            # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets
             # such as "{XL}" and "{L}".
             if ("{" + split + "}") in audio["subsets"]:
                 wav_path = os.path.join(data_folder, audio["path"])
                 assert wav_path.is_file(), f"File not found: {wav_path}"
+
+                # 2. iterate over the utterances
+                utterances = []
+                for segment in audio["segments"]:
+                    text = preprocess_text(segment["text_tn"])
+                    if text:
+                        print(segment["begin_time"], segment["end_time"])
+                        begin_time = float(segment["begin_time"])
+                        end_time = float(segment["end_time"])
+                        duration = end_time - begin_time
+                        utterance = GigaSpeechRow(
+                            utt_id=segment["sid"],
+                            wav_id=audio["aid"],
+                            speaker=audio["speaker"],
+                            begin_time=begin_time,
+                            end_time=end_time,
+                            duration=duration,
+                            text=text,
+                        )
+                        print(utterance)
+                        exit()
+
+        ret[split].append(utterances)
+    exit()

 def preprocess_text(text: str) -> str:
     """
@@ -90,17 +133,19 @@
     Examples
     --------
-    >>> preprocess_text("Hello world <SIL> <EXCLAMATIONPOINT> ")
-    'hello world !'
+    >>> text = " DOUGLAS MCGRAY IS GOING TO BE OUR GUIDE YOU WALK THROUGH THE DOOR <COMMA> YOU SEE THE RED CARPETING <COMMA> YOU SEE SOMEONE IN A SUIT <PERIOD> THEY MAY BE GREETING YOU <PERIOD>"
+    >>> preprocess_text(text)
+    "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you."
     """
     # Remove garbage tags
     for tag in GRABAGE_UTTERANCE_TAGS:
-        text = text.replace(tag, "")
+        if tag in text:
+            return ""

     # Remove punctuation tags
     for tag, punctuation in PUNCTUATION_TAGS.items():
-        text = text.replace(' ' + tag + ' ', punctuation)
-
+        text = text.replace(' ' + tag, punctuation)
+
     assert "<" not in text and ">" not in text, f"Found tags in the text: {text}"
     return text.lower()

@@ -145,4 +190,12 @@
         "the opus file %s does not exist (it is expected in the "
         "Gigaspeech dataset)" % path
     )
-    raise OSError(err_msg)
\ No newline at end of file
+    raise OSError(err_msg)
+
+
+if __name__ == "__main__":
+    data_folder = "/local_disk/idyie/amoumen/GigaSpeech_data/"
+    save_folder = "."
+    train_subset = ["XS"]
+    prepare_gigaspeech(data_folder, save_folder, train_subset=train_subset)
+    print("Done")
\ No newline at end of file
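For reference, each entry of the `audios` list in `GigaSpeech.json` roughly follows the shape below (field names as consumed by the loop above; values invented), which is what gets turned into `GigaSpeechRow` objects:

    audio = {
        "aid": "POD0000000001",
        "speaker": "N/A",
        "path": "audio/podcast/P0000/POD0000000001.opus",
        "subsets": ["{XL}", "{L}", "{DEV}"],
        "segments": [
            {
                "sid": "POD0000000001_S0000001",
                "text_tn": "HELLO WORLD <PERIOD>",
                "begin_time": 0.0,
                "end_time": 2.5,
            },
        ],
    }

One audio can belong to several subsets at once, hence the `("{" + split + "}") in audio["subsets"]` membership test.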
From b3137341878736ec6c31e94f5a7bcf2b84d0b62c Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 14:07:08 +0100
Subject: [PATCH 03/77] without parallel map

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 100 +++++++++++++++++------
 1 file changed, 76 insertions(+), 24 deletions(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index 1c63bafbb5..7e20010fb7 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -13,6 +13,7 @@
 import os
 import json
 from dataclasses import dataclass
+import functools
 from speechbrain.utils.parallel import parallel_map

@@ -32,24 +33,26 @@
 class GigaSpeechRow:
     utt_id: str  # segment[sid]
     wav_id: str  # audio[aid]
+    wav_path: str
     speaker: str  # audio["speaker"]
     begin_time: float
     end_time: float
     duration: float
     text: str

 def prepare_gigaspeech(
     data_folder,
     save_folder,
-    splits: list = SPLITS,
-    train_subset: list = TRAIN_SUBSET,
+    splits: list,
     json_file="GigaSpeech.json",
     skip_prep: bool = False,
 ):
     """TODO.
     """
+    # check that `splits` input is valid
+    for split in splits:
+        assert split in SPLITS + TRAIN_SUBSET, f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}."
+ if skip_prep: logger.info("Skipping data preparation as `skip_prep` is set to `True`") return @@ -67,45 +70,93 @@ def prepare_gigaspeech( json_metadata = os.path.join(data_folder, json_file) logger.info("Creating train, dev, and test subsets.") + print("Starting reading JSON file.") with open(json_metadata, "r") as f: info = json.load(f) - - ret = {} - import time - time1 = time.time() - for split in splits + train_subset: - ret[split] = [] - for audio in info["audios"]: - # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets - # such as "{XL}" and "{L}". - if ("{" + split + "}") in audio["subsets"]: - wav_path = os.path.join(data_folder, audio["path"]) - assert wav_path.is_file(), f"File not found: {wav_path}" + + print("Reading JSON file done.") + for split in splits: + print(f"Creating CSV for {split} split.") + output_csv_file = os.path.join(save_folder, f"{split}.csv") + create_csv(output_csv_file, info, split, data_folder) + exit() + +def process_line(audio, split): + if ("{" + split + "}") in audio["subsets"]: + print("VALID AUDIO FILE") + + wav_path = os.path.join(data_folder, audio["path"]) + assert os.path.isfile(wav_path), f"File not found: {wav_path}" + + # 2. iterate over the utterances + utterances = [] + for segment in audio["segments"]: + text = preprocess_text(segment["text_tn"]) + if text: + begin_time = float(segment["begin_time"]) + end_time = float(segment["end_time"]) + duration = end_time - begin_time + utterance = GigaSpeechRow( + utt_id=segment["sid"], + wav_id=audio["aid"], + wav_path=str(wav_path), + speaker=audio["speaker"], + begin_time=begin_time, + end_time=end_time, + duration=duration, + text=text, + ) + utterances.append(utterance) + return utterances + +def create_csv(csv_file, info, split, data_folder): + """TODO. + """ + ret = [] + print("inside create_csv") + logger.info(f"Creating CSV for {split} split.") + csv_file_tmp = csv_file + ".tmp" + total_duration = 0.0 + + line_processor = functools.partial( + process_line, + split=split, + ) + + print("audios = ", len(info["audios"])) + for audio in info["audios"]: + + # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets + # such as "{XL}" and "{L}". + if ("{" + split + "}") in audio["subsets"]: + print("VALID AUDIO FILE") + + wav_path = os.path.join(data_folder, audio["path"]) + assert os.path.isfile(wav_path), f"File not found: {wav_path}" # 2. iterate over the utterances utterances = [] + print("segments = ", len(audio["segments"])) for segment in audio["segments"]: text = preprocess_text(segment["text_tn"]) if text: - print(segment["begin_time"], segment["end_time"]) begin_time = float(segment["begin_time"]) end_time = float(segment["end_time"]) duration = end_time - begin_time utterance = GigaSpeechRow( utt_id=segment["sid"], wav_id=audio["aid"], + wav_path=str(wav_path), speaker=audio["speaker"], begin_time=begin_time, end_time=end_time, duration=duration, text=text, ) - print(utterance) - exit() - - ret[split].append(utterances) + total_duration += duration + ret.append(utterances) exit() - + def preprocess_text(text: str) -> str: """ Preprocesses the input text by removing garbage tags and replacing punctuation tags. @@ -196,6 +247,7 @@ def check_file(path): if __name__ == "__main__": data_folder = "/local_disk/idyie/amoumen/GigaSpeech_data/" save_folder = "." 
-    train_subset = ["XS"]
-    prepare_gigaspeech(data_folder, save_folder, train_subset=train_subset)
+    splits = ["XS", "DEV", "TEST"]
+    print("HERE")
+    prepare_gigaspeech(data_folder, save_folder, splits=splits)
     print("Done")
\ No newline at end of file

From 7bdb17f7f649c618b2b3f259018a2c68ba75bf79 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 15:04:57 +0100
Subject: [PATCH 04/77] parallel map -> so fast omfg

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 119 +++++++++++++----------
 1 file changed, 66 insertions(+), 53 deletions(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index 7e20010fb7..6fa4c27d4f 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -12,10 +12,12 @@
 import logging
 import os
 import json
+import csv
 from dataclasses import dataclass
 import functools
 from speechbrain.utils.parallel import parallel_map

+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 SAMPLERATE = 16000
 GRABAGE_UTTERANCE_TAGS = ["<SIL>", "<MUSIC>", "<NOISE>", "<OTHER>"]
 PUNCTUATION_TAGS = {
@@ -32,8 +34,8 @@
 @dataclass
 class GigaSpeechRow:
     utt_id: str  # segment[sid]
-    wav_id: str  # audio[aid]
-    wav_path: str
+    audio_id: str  # audio[aid]
+    audio_path: str  # by default this is opus files
     speaker: str  # audio["speaker"]
     begin_time: float
     end_time: float
     duration: float
     text: str

 def prepare_gigaspeech(
     data_folder,
     save_folder,
     splits: list,
     json_file="GigaSpeech.json",
     skip_prep: bool = False,
 ):
     """TODO.
     """
     # check that `splits` input is valid
     for split in splits:
         assert split in SPLITS + TRAIN_SUBSET, f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}."

+    # check that we are not using multiple train subsets
+    if len(set(splits).intersection(TRAIN_SUBSET)) > 1:
+        raise ValueError("You cannot use multiple train subsets. Please select only one train subset.")
+
     if skip_prep:
         logger.info("Skipping data preparation as `skip_prep` is set to `True`")
         return

     os.makedirs(save_folder, exist_ok=True)

     if skip():  # TODO: Implement skip function
         logger.info("Skipping preparation, completed in previous run.")
         return
     else:
         logger.info("Starting data preparation...")

     check_gigaspeech_folders(data_folder, json_file)

     json_metadata = os.path.join(data_folder, json_file)
     logger.info("Creating train, dev, and test subsets.")

-    print("Starting reading JSON file.")
+    logger.info(f"Starting reading {json_file}.")
     with open(json_metadata, "r") as f:
         info = json.load(f)
+    logger.info(f"Reading {json_file} done.")

-    print("Reading JSON file done.")
     for split in splits:
-        print(f"Creating CSV for {split} split.")
-        output_csv_file = os.path.join(save_folder, f"{split}.csv")
-        create_csv(output_csv_file, info, split, data_folder)
-    exit()
+        if split in TRAIN_SUBSET:
+            logger.info(f"Starting creating train.csv using {split} subset.")
+            output_csv_file = os.path.join(save_folder, f"train.csv")
+            create_csv(output_csv_file, info, split)
+        else:
+            logger.info(f"Starting creating {split.lower()}.csv using {split} subset.")
+            output_csv_file = os.path.join(save_folder, f"{split.lower()}.csv")
+            create_csv(output_csv_file, info, split)

 def process_line(audio, split):
     if ("{" + split + "}") in audio["subsets"]:
-        print("VALID AUDIO FILE")
-
-        wav_path = os.path.join(data_folder, audio["path"])
-        assert os.path.isfile(wav_path), f"File not found: {wav_path}"
+        audio_path = os.path.join(data_folder, audio["path"])
+        assert os.path.isfile(audio_path), f"File not found: {audio_path}"

         # 2.
iterate over the utterances utterances = [] @@ -98,8 +107,8 @@ def process_line(audio, split): duration = end_time - begin_time utterance = GigaSpeechRow( utt_id=segment["sid"], - wav_id=audio["aid"], - wav_path=str(wav_path), + audio_id=audio["aid"], + audio_path=str(audio_path), speaker=audio["speaker"], begin_time=begin_time, end_time=end_time, @@ -109,54 +118,58 @@ def process_line(audio, split): utterances.append(utterance) return utterances -def create_csv(csv_file, info, split, data_folder): +def create_csv(csv_file, info, split): """TODO. - """ - ret = [] - print("inside create_csv") - logger.info(f"Creating CSV for {split} split.") - csv_file_tmp = csv_file + ".tmp" + """ total_duration = 0.0 + nb_samples = 0 line_processor = functools.partial( process_line, split=split, ) - - print("audios = ", len(info["audios"])) - for audio in info["audios"]: - - # 1. Check if the audio is part of the "subsets". One audio can be part of multiple subsets - # such as "{XL}" and "{L}". - if ("{" + split + "}") in audio["subsets"]: - print("VALID AUDIO FILE") - - wav_path = os.path.join(data_folder, audio["path"]) - assert os.path.isfile(wav_path), f"File not found: {wav_path}" - - # 2. iterate over the utterances - utterances = [] - print("segments = ", len(audio["segments"])) - for segment in audio["segments"]: - text = preprocess_text(segment["text_tn"]) - if text: - begin_time = float(segment["begin_time"]) - end_time = float(segment["end_time"]) - duration = end_time - begin_time - utterance = GigaSpeechRow( - utt_id=segment["sid"], - wav_id=audio["aid"], - wav_path=str(wav_path), - speaker=audio["speaker"], - begin_time=begin_time, - end_time=end_time, - duration=duration, - text=text, - ) - total_duration += duration - ret.append(utterances) - exit() + csv_file_tmp = csv_file + ".tmp" + with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL + ) + header = [ + "utt_id", + "audio_id", + "audio_path", + "speaker", + "begin_time", + "end_time", + "duration", + "text", + ] + csv_writer.writerow(header) + for row in parallel_map(line_processor, info["audios"]): + if row is None: + continue + + for item in row: + csv_writer.writerow([ + item.utt_id, + item.audio_id, + item.audio_path, + item.speaker, + str(item.begin_time), + str(item.end_time), + str(item.duration), + item.text + ]) + + total_duration += item.duration + nb_samples += 1 + + os.replace(csv_file_tmp, csv_file) + + logger.info(f"{csv_file} succesfully created!") + logger.info(f"Number of samples in {split} split: {nb_samples}") + logger.info(f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours") + def preprocess_text(text: str) -> str: """ Preprocesses the input text by removing garbage tags and replacing punctuation tags. 
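The speed-up in this commit comes from `speechbrain.utils.parallel.parallel_map`, which fans a worker function out over the metadata entries while yielding results in input order, combined with `functools.partial` to bind the extra arguments. A toy sketch of the pattern (invented function and values, not from the recipe):

    import functools
    from speechbrain.utils.parallel import parallel_map

    def scale(x, factor):
        return x * factor

    # Bind the fixed argument, as done with `split` in `process_line` above.
    worker = functools.partial(scale, factor=2)
    results = list(parallel_map(worker, range(4)))  # -> [0, 2, 4, 6]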
From a3d2d4c6e3ee729bd60086d90ea8daeef2ff3e0f Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 15:49:37 +0100
Subject: [PATCH 05/77] gigaspeech data prep done

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 282 ++++++++++++++++-------
 1 file changed, 196 insertions(+), 86 deletions(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index 6fa4c27d4f..d13fda4bb2 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
 import logging
 import os
 import json
 import csv
 from dataclasses import dataclass
 import functools
 from speechbrain.utils.parallel import parallel_map

 logger = logging.getLogger(__name__)

 GRABAGE_UTTERANCE_TAGS = ["<SIL>", "<MUSIC>", "<NOISE>", "<OTHER>"]
 PUNCTUATION_TAGS = {
     "<COMMA>": ",",
     "<EXCLAMATIONPOINT>": "!",
     "<PERIOD>": ".",
     "<QUESTIONMARK>": "?",
 }
 SPLITS = ["DEV", "TEST"]
 TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"]


 @dataclass
 class GigaSpeechRow:
     """ Dataclass for handling GigaSpeech rows.

     Attributes
     ----------
     utt_id : str
         The segment ID.
     audio_id : str
         The audio ID.
     audio_path : str
         The path to the audio file.
     speaker : str
         The speaker ID.
     begin_time : float
         The start time of the segment.
     end_time : float
         The end time of the segment.
     duration : float
         The duration of the segment.
     text : str
         The text of the segment.
     """

     utt_id: str  # segment[sid]
     audio_id: str  # audio[aid]
     audio_path: str  # by default this is opus files
     speaker: str  # audio["speaker"]
     begin_time: float
     end_time: float
     duration: float
     text: str


 def prepare_gigaspeech(
     data_folder: str,
     save_folder: str,
     splits: list,
     json_file: str = "GigaSpeech.json",
     skip_prep: bool = False,
 ) -> None:
     """ Prepare the csv files for GigaSpeech dataset.

     Download instructions: https://github.com/SpeechColab/GigaSpeech
     Reference: https://arxiv.org/abs/2106.06909

     The `train.csv` file is created by following the train subset specified in the `splits` list.
     It must be part of the `TRAIN_SUBSET` list. You cannot use multiple train subsets.

     The `dev.csv` and `test.csv` files are created based on the `DEV` and `TEST` splits
     specified in the `splits` list.

     Parameters
     ----------
     data_folder : str
         The path to the GigaSpeech dataset.
     save_folder : str
         The path to the folder where the CSV files will be saved.
     splits : list
         The list of splits to be used for creating the CSV files.
     json_file : str, optional
         The name of the JSON file containing the metadata of the GigaSpeech dataset.
     skip_prep : bool, optional
         If True, the data preparation will be skipped, and the function will return immediately.

     Returns
     -------
     None
     """
     if skip_prep:
         logger.info("Skipping data preparation as `skip_prep` is set to `True`")
         return

     # check that `splits` input is valid
     for split in splits:
         assert (
             split in SPLITS + TRAIN_SUBSET
         ), f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}."

     # check that we are not using multiple train subsets
     if len(set(splits).intersection(TRAIN_SUBSET)) > 1:
         raise ValueError(
             "You cannot use multiple train subsets.
Please select only one train subset.") + raise ValueError( + "You cannot use multiple train subsets. Please select only one train subset." + ) - if skip_prep: - logger.info("Skipping data preparation as `skip_prep` is set to `True`") - return - os.makedirs(save_folder, exist_ok=True) - if skip(): # TODO: Implement skip function + # Setting output files + save_csv_files = {} + for split in splits: + if split in TRAIN_SUBSET: + save_csv_files[split] = os.path.join(save_folder, "train.csv") + else: + save_csv_files[split] = os.path.join( + save_folder, f"{split.lower()}.csv" + ) + + # check if the data is already prepared + if skip(save_csv_files): logger.info("Skipping preparation, completed in previous run.") return else: @@ -74,24 +137,36 @@ def prepare_gigaspeech( check_gigaspeech_folders(data_folder, json_file) json_metadata = os.path.join(data_folder, json_file) - logger.info("Creating train, dev, and test subsets.") - logger.info(f"Starting reading {json_file}.") with open(json_metadata, "r") as f: info = json.load(f) logger.info(f"Reading {json_file} done.") - - for split in splits: - if split in TRAIN_SUBSET: - logger.info(f"Starting creating train.csv using {split} subset.") - output_csv_file = os.path.join(save_folder, f"train.csv") - create_csv(output_csv_file, info, split) - else: - logger.info(f"Starting creating {split.lower()}.csv using {split} subset.") - output_csv_file = os.path.join(save_folder, f"{split.lower()}.csv") - create_csv(output_csv_file, info, split) -def process_line(audio, split): + logger.info("Creating train, dev, and test subsets.") + for split, output_csv_file in save_csv_files.items(): + logger.info(f"Starting creating {output_csv_file} using {split} split.") + create_csv(output_csv_file, info, data_folder, split) + logger.info("Data preparation completed!") + + +def process_line(audio: json, data_folder: str, split: str) -> list: + """ + Process the audio line and return the utterances for the given split. + + Parameters + ---------- + audio : dict + The audio line to be processed. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. + + Returns + ------- + list + The list of utterances for the given split. + """ if ("{" + split + "}") in audio["subsets"]: audio_path = os.path.join(data_folder, audio["path"]) @@ -118,17 +193,33 @@ def process_line(audio, split): utterances.append(utterance) return utterances -def create_csv(csv_file, info, split): - """TODO. - """ + +def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: + """ + Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. + + Parameters + ---------- + csv_file : str + The path to the CSV file to be created. + info : dict + The GigaSpeech JSON file content. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. 
+ + Returns + ------- + None + """ total_duration = 0.0 nb_samples = 0 - + line_processor = functools.partial( - process_line, - split=split, + process_line, data_folder=data_folder, split=split, ) - + csv_file_tmp = csv_file + ".tmp" with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: csv_writer = csv.writer( @@ -148,28 +239,33 @@ def create_csv(csv_file, info, split): for row in parallel_map(line_processor, info["audios"]): if row is None: continue - + for item in row: - csv_writer.writerow([ - item.utt_id, - item.audio_id, - item.audio_path, - item.speaker, - str(item.begin_time), - str(item.end_time), - str(item.duration), - item.text - ]) - + csv_writer.writerow( + [ + item.utt_id, + item.audio_id, + item.audio_path, + item.speaker, + str(item.begin_time), + str(item.end_time), + str(item.duration), + item.text, + ] + ) + total_duration += item.duration nb_samples += 1 - + os.replace(csv_file_tmp, csv_file) logger.info(f"{csv_file} succesfully created!") logger.info(f"Number of samples in {split} split: {nb_samples}") - logger.info(f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours") - + logger.info( + f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" + ) + + def preprocess_text(text: str) -> str: """ Preprocesses the input text by removing garbage tags and replacing punctuation tags. @@ -205,25 +301,51 @@ def preprocess_text(text: str) -> str: for tag in GRABAGE_UTTERANCE_TAGS: if tag in text: return "" - + # Remove punctuation tags for tag, punctuation in PUNCTUATION_TAGS.items(): - text = text.replace(' ' + tag, punctuation) - - assert "<" not in text and ">" not in text, f"Found tags in the text: {text}" + text = text.replace(" " + tag, punctuation) + + assert ( + "<" not in text and ">" not in text + ), f"Found tags in the text: {text}" return text.lower() -def skip(): - """TODO. +def skip(save_csv_files: dict) -> bool: + """ Check if the CSV files already exist. + + Parameters + ---------- + save_csv_files : dict + The dictionary containing the paths to the CSV files. + + Returns + ------- + bool + True if all the CSV files already exist, False otherwise. """ - return False + return all(os.path.isfile(path) for path in save_csv_files.values()) + -def check_gigaspeech_folders(data_folder, json_file="GigaSpeech.json", audio_folder="audio"): +def check_gigaspeech_folders( + data_folder: str, + json_file: str = "GigaSpeech.json", + audio_folder: str = "audio", +) -> None: """Check if the data folder actually contains the GigaSpeech dataset. If it does not, an error is raised. + Parameters + ---------- + data_folder : str + The path to the GigaSpeech dataset. + json_file : str, optional + The name of the JSON file containing the metadata of the GigaSpeech dataset. + audio_folder : str, optional + The name of the folder containing the audio files of the GigaSpeech dataset. 
     Returns
     -------
     None

     Raises
     ------
     OSError
         If GigaSpeech is not found at the specified path.
     """
     # Checking if "GigaSpeech.json" exist
     json_gigaspeech = os.path.join(data_folder, json_file)

     if not os.path.exists(json_gigaspeech):
         err_msg = (
             "the opus file %s does not exist (it is expected in the "
             "Gigaspeech dataset)" % json_gigaspeech
         )
         raise OSError(err_msg)

     # Check if audio folders exist
     for folder_subset in ["audiobook", "podcast", "youtube"]:
         audio_subset = os.path.join(data_folder, audio_folder, folder_subset)
         if not os.path.exists(audio_subset):
             err_msg = (
                 "the file %s does not exist (it is expected in the "
                 "Gigaspeech dataset)" % audio_subset
             )
             raise OSError(err_msg)

From 4cb3257761777db61bf8fb42ec9bd35096533f14 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 15:50:13 +0100
Subject: [PATCH 06/77] speechcolab extra dep if one must download gigaspeech

---
 recipes/GigaSpeech/extra_requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 recipes/GigaSpeech/extra_requirements.txt

diff --git a/recipes/GigaSpeech/extra_requirements.txt b/recipes/GigaSpeech/extra_requirements.txt
new file mode 100644
index 0000000000..7a239a9e76
--- /dev/null
+++ b/recipes/GigaSpeech/extra_requirements.txt
@@ -0,0 +1 @@
+speechcolab

From e521cc1e82f7c880c75cd5435e5d2900b7d9e304 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 15:53:46 +0100
Subject: [PATCH 07/77] create ASR CTC folder

---
 recipes/GigaSpeech/ASR/CTC/README.md          |   1 +
 .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py  | 376 ++++++++++++++++++
 2 files changed, 377 insertions(+)
 create mode 100644 recipes/GigaSpeech/ASR/CTC/README.md
 create mode 100644 recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py

diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md
new file mode 100644
index 0000000000..9cfe1d8671
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/CTC/README.md
@@ -0,0 +1 @@
+to do
\ No newline at end of file

diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py
new file mode 100644
index 0000000000..d13fda4bb2
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py
@@ -0,0 +1,376 @@
"""
Data preparation script for the GigaSpeech dataset.

Download instructions: https://github.com/SpeechColab/GigaSpeech
Reference: https://arxiv.org/abs/2106.06909

Author
-------
 * Adel Moumen, 2024
"""

import logging
import os
import json
import csv
from dataclasses import dataclass
import functools
from speechbrain.utils.parallel import parallel_map

logger = logging.getLogger(__name__)

GRABAGE_UTTERANCE_TAGS = ["<SIL>", "<MUSIC>", "<NOISE>", "<OTHER>"]
PUNCTUATION_TAGS = {
    "<COMMA>": ",",
    "<EXCLAMATIONPOINT>": "!",
    "<PERIOD>": ".",
    "<QUESTIONMARK>": "?",
}
SPLITS = ["DEV", "TEST"]
TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"]


@dataclass
class GigaSpeechRow:
    """ Dataclass for handling GigaSpeech rows.

    Attributes
    ----------
    utt_id : str
        The segment ID.
+ audio_id : str + The audio ID. + audio_path : str + The path to the audio file. + speaker : str + The speaker ID. + begin_time : float + The start time of the segment. + end_time : float + The end time of the segment. + duration : float + The duration of the segment. + text : str + The text of the segment. + """ + + utt_id: str # segment[sid] + audio_id: str # audio[aid] + audio_path: str # by default this is opus files + speaker: str # audio["speaker"] + begin_time: float + end_time: float + duration: float + text: str + + +def prepare_gigaspeech( + data_folder: str, + save_folder: str, + splits: list, + json_file: str = "GigaSpeech.json", + skip_prep: bool = False, +) -> None: + """ Prepare the csv files for GigaSpeech dataset. + + Download instructions: https://github.com/SpeechColab/GigaSpeech + Reference: https://arxiv.org/abs/2106.06909 + + The `train.csv` file is created by following the train subset specified in the `splits` list. + It must be part of the `TRAIN_SUBSET` list. You cannot use multiple train subsets. + + The `dev.csv` and `test.csv` files are created based on the `DEV` and `TEST` splits + specified in the `splits` list. + + Parameters + ---------- + data_folder : str + The path to the GigaSpeech dataset. + save_folder : str + The path to the folder where the CSV files will be saved. + splits : list + The list of splits to be used for creating the CSV files. + json_file : str, optional + The name of the JSON file containing the metadata of the GigaSpeech dataset. + skip_prep : bool, optional + If True, the data preparation will be skipped, and the function will return immediately. + + Returns + ------- + None + """ + if skip_prep: + logger.info("Skipping data preparation as `skip_prep` is set to `True`") + return + + # check that `splits` input is valid + for split in splits: + assert ( + split in SPLITS + TRAIN_SUBSET + ), f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}." + + # check that we are not using multiple train subsets + if len(set(splits).intersection(TRAIN_SUBSET)) > 1: + raise ValueError( + "You cannot use multiple train subsets. Please select only one train subset." + ) + + os.makedirs(save_folder, exist_ok=True) + + # Setting output files + save_csv_files = {} + for split in splits: + if split in TRAIN_SUBSET: + save_csv_files[split] = os.path.join(save_folder, "train.csv") + else: + save_csv_files[split] = os.path.join( + save_folder, f"{split.lower()}.csv" + ) + + # check if the data is already prepared + if skip(save_csv_files): + logger.info("Skipping preparation, completed in previous run.") + return + else: + logger.info("Starting data preparation...") + + check_gigaspeech_folders(data_folder, json_file) + + json_metadata = os.path.join(data_folder, json_file) + logger.info(f"Starting reading {json_file}.") + with open(json_metadata, "r") as f: + info = json.load(f) + logger.info(f"Reading {json_file} done.") + + logger.info("Creating train, dev, and test subsets.") + for split, output_csv_file in save_csv_files.items(): + logger.info(f"Starting creating {output_csv_file} using {split} split.") + create_csv(output_csv_file, info, data_folder, split) + logger.info("Data preparation completed!") + + +def process_line(audio: json, data_folder: str, split: str) -> list: + """ + Process the audio line and return the utterances for the given split. + + Parameters + ---------- + audio : dict + The audio line to be processed. + data_folder : str + The path to the GigaSpeech dataset. 
    split : str
        The split to be used for filtering the data.

    Returns
    -------
    list
        The list of utterances for the given split.
    """
    if ("{" + split + "}") in audio["subsets"]:

        audio_path = os.path.join(data_folder, audio["path"])
        assert os.path.isfile(audio_path), f"File not found: {audio_path}"

        # 2. iterate over the utterances
        utterances = []
        for segment in audio["segments"]:
            text = preprocess_text(segment["text_tn"])
            if text:
                begin_time = float(segment["begin_time"])
                end_time = float(segment["end_time"])
                duration = end_time - begin_time
                utterance = GigaSpeechRow(
                    utt_id=segment["sid"],
                    audio_id=audio["aid"],
                    audio_path=str(audio_path),
                    speaker=audio["speaker"],
                    begin_time=begin_time,
                    end_time=end_time,
                    duration=duration,
                    text=text,
                )
                utterances.append(utterance)
        return utterances


def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None:
    """
    Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split.

    Parameters
    ----------
    csv_file : str
        The path to the CSV file to be created.
    info : dict
        The GigaSpeech JSON file content.
    data_folder : str
        The path to the GigaSpeech dataset.
    split : str
        The split to be used for filtering the data.

    Returns
    -------
    None
    """
    total_duration = 0.0
    nb_samples = 0

    line_processor = functools.partial(
        process_line, data_folder=data_folder, split=split,
    )

    csv_file_tmp = csv_file + ".tmp"
    with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        header = [
            "utt_id",
            "audio_id",
            "audio_path",
            "speaker",
            "begin_time",
            "end_time",
            "duration",
            "text",
        ]
        csv_writer.writerow(header)
        for row in parallel_map(line_processor, info["audios"]):
            if row is None:
                continue

            for item in row:
                csv_writer.writerow(
                    [
                        item.utt_id,
                        item.audio_id,
                        item.audio_path,
                        item.speaker,
                        str(item.begin_time),
                        str(item.end_time),
                        str(item.duration),
                        item.text,
                    ]
                )

                total_duration += item.duration
                nb_samples += 1

    os.replace(csv_file_tmp, csv_file)

    logger.info(f"{csv_file} succesfully created!")
    logger.info(f"Number of samples in {split} split: {nb_samples}")
    logger.info(
        f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours"
    )


def preprocess_text(text: str) -> str:
    """
    Preprocesses the input text by removing garbage tags and replacing punctuation tags.

    Parameters
    ----------
    text : str
        The input text to be preprocessed.

    Returns
    -------
    str
        The preprocessed text with removed garbage tags and replaced punctuation tags.

    Raises
    ------
    AssertionError
        If '<' or '>' tags are found in the text after preprocessing.

    Notes
    -----
    The function iterates over predefined garbage utterance tags (GRABAGE_UTTERANCE_TAGS)
    and removes them from the input text. It then iterates over predefined punctuation tags
    (PUNCTUATION_TAGS) and replaces them with the corresponding punctuation.

    Examples
    --------
    >>> text = " DOUGLAS MCGRAY IS GOING TO BE OUR GUIDE YOU WALK THROUGH THE DOOR <COMMA> YOU SEE THE RED CARPETING <COMMA> YOU SEE SOMEONE IN A SUIT <PERIOD> THEY MAY BE GREETING YOU <PERIOD>"
    >>> preprocess_text(text)
    "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you."
    """
    # Remove garbage tags
    for tag in GRABAGE_UTTERANCE_TAGS:
        if tag in text:
            return ""

    # Remove punctuation tags
    for tag, punctuation in PUNCTUATION_TAGS.items():
        text = text.replace(" " + tag, punctuation)

    assert (
        "<" not in text and ">" not in text
    ), f"Found tags in the text: {text}"
    return text.lower()


def skip(save_csv_files: dict) -> bool:
    """ Check if the CSV files already exist.

    Parameters
    ----------
    save_csv_files : dict
        The dictionary containing the paths to the CSV files.

    Returns
    -------
    bool
        True if all the CSV files already exist, False otherwise.
    """
    return all(os.path.isfile(path) for path in save_csv_files.values())


def check_gigaspeech_folders(
    data_folder: str,
    json_file: str = "GigaSpeech.json",
    audio_folder: str = "audio",
) -> None:
    """Check if the data folder actually contains the GigaSpeech dataset.

    If it does not, an error is raised.

    Parameters
    ----------
    data_folder : str
        The path to the GigaSpeech dataset.
    json_file : str, optional
        The name of the JSON file containing the metadata of the GigaSpeech dataset.
    audio_folder : str, optional
        The name of the folder containing the audio files of the GigaSpeech dataset.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If GigaSpeech is not found at the specified path.
    """
    # Checking if "GigaSpeech.json" exist
    json_gigaspeech = os.path.join(data_folder, json_file)

    if not os.path.exists(json_gigaspeech):
        err_msg = (
            "the opus file %s does not exist (it is expected in the "
            "Gigaspeech dataset)" % json_gigaspeech
        )
        raise OSError(err_msg)

    # Check if audio folders exist
    for folder_subset in ["audiobook", "podcast", "youtube"]:
        audio_subset = os.path.join(data_folder, audio_folder, folder_subset)
        if not os.path.exists(audio_subset):
            err_msg = (
                "the file %s does not exist (it is expected in the "
                "Gigaspeech dataset)" % audio_subset
            )
            raise OSError(err_msg)
From 92a17c16f6c50d5b9f826c82dde69e487c1d9721 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 16:04:15 +0100
Subject: [PATCH 08/77] base yaml + update data prep to better reflect
 potential different naming for csvs

---
 .../ASR/CTC/hparams/train_with_wavlm.yaml     | 183 ++++++++++++++++++
 recipes/GigaSpeech/gigaspeech_prepare.py      |  18 ++-
 2 files changed, 197 insertions(+), 4 deletions(-)
 create mode 100644 recipes/GigaSpeech/ASR/CTC/hparams/train_with_wavlm.yaml

diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_with_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_with_wavlm.yaml
new file mode 100644
index 0000000000..b172cd559c
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_with_wavlm.yaml
@@ -0,0 +1,183 @@
# ################################
# Model: wavlm + DNN + CTC
# Decoding AM: Greedy for validation, and Beam search for testing
# Augmentation: SpecAugment
# Authors: Adel Moumen 2024
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/train_wavlm_char/<seed>
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

wav2vec2_hub: microsoft/wavlm-large
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

# Data files
data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech

# see https://github.com/SpeechColab/GigaSpeech for more details on the dataset
# must be one of ["XS", "S", "M", "L", "XL"]
# and ["DEV", "TEST"] for the eval splits.
splits: ["XS", "DEV", "TEST"]
skip_prep: False
ckpt_interval_minutes: 25 # save checkpoint every N min
train_csv: !ref <save_folder>/train.csv
valid_csv: !ref <save_folder>/dev.csv
test_csv: !ref <save_folder>/test.csv

# Training parameters
number_of_epochs: 1
lr: 0.9
lr_wav2vec: 0.0001
sorting: ascending
precision: fp32 # bf16, fp16 or fp32
sample_rate: 16000

# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
batch_size: 6
test_batch_size: 8

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

# Model parameters
activation: !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
freeze_wav2vec: True

# Outputs
output_neurons: 29 # BPE size, index(blank/eos/bos) = 0
blank_index: 0

# Decoding parameters
test_beam_search:
    beam_size: 143
    topk: 1
    blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.2
    prune_history: True

#
# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: False
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]


enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 1024]
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_layers>
    dnn_neurons: !ref <dnn_neurons>

wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: !ref <freeze_wav2vec>
    save_path: !ref <wav2vec2_folder>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

modules:
    wav2vec2: !ref <wav2vec2>
    enc: !ref <enc>
    ctc_lin: !ref <ctc_lin>

model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

wav2vec_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_wav2vec>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_wav2vec>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        wav2vec2: !ref <wav2vec2>
        model: !ref <model>
        scheduler_model: !ref <lr_annealing_model>
        scheduler_wav2vec: !ref <lr_annealing_wav2vec>
        counter: !ref <epoch_counter>
        tokenizer: !ref <label_encoder>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index d13fda4bb2..a1728e59d1 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -68,6 +68,9 @@ def prepare_gigaspeech(
     data_folder: str,
     save_folder: str,
     splits: list,
+    output_train_csv_filename=None,
+    output_dev_csv_filename=None,
+    output_test_csv_filename=None,
     json_file: str = "GigaSpeech.json",
     skip_prep: bool = False,
 ) -> None:
@@ -90,6 +93,12 @@
         The path to the folder where the CSV files will be saved.
     splits : list
         The list of splits to be used for creating the CSV files.
+    output_train_csv_filename : str, optional
+        The name of the CSV file which will be containing the train subset.
+    output_dev_csv_filename : str, optional
+        The name of the CSV file which will be containing the dev subset.
+    output_test_csv_filename : str, optional
+        The name of the CSV file which will be containing the test subset.
     json_file : str, optional
         The name of the JSON file containing the metadata of the GigaSpeech dataset.
     skip_prep : bool, optional
@@ -121,11 +130,12 @@
     save_csv_files = {}
     for split in splits:
         if split in TRAIN_SUBSET:
-            save_csv_files[split] = os.path.join(save_folder, "train.csv")
+            save_csv_files[split] = output_train_csv_filename
         else:
-            save_csv_files[split] = os.path.join(
-                save_folder, f"{split.lower()}.csv"
-            )
+            if split == "DEV":
+                save_csv_files[split] = output_dev_csv_filename
+            elif split == "TEST":
+                save_csv_files[split] = output_test_csv_filename
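The new `output_*_csv_filename` arguments pair with the `train_csv`, `valid_csv`, and `test_csv` entries of the YAML above. A sketch of the intended call from a training script (hparams keys as defined in the YAML; the wiring is assumed here, since the script introduced in the next commit still calls the LibriSpeech prep):

    run_on_main(
        prepare_gigaspeech,
        kwargs={
            "data_folder": hparams["data_folder"],
            "save_folder": hparams["save_folder"],
            "splits": hparams["splits"],
            "output_train_csv_filename": hparams["train_csv"],
            "output_dev_csv_filename": hparams["valid_csv"],
            "output_test_csv_filename": hparams["test_csv"],
            "skip_prep": hparams["skip_prep"],
        },
    )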
From 4dd02a0ba901fe4219d2aa2862e9e532eae65bd3 Mon Sep 17 00:00:00 2001
From: Adel Moumen
Date: Sat, 10 Feb 2024 16:09:10 +0100
Subject: [PATCH 09/77] update recipe

---
 ...in_with_wavlm.yaml => train_hf_wavlm.yaml} |   0
 recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 386 ++++++++++++++++++
 tests/recipes/GigaSpeech.csv                  |   2 +
 3 files changed, 388 insertions(+)
 rename recipes/GigaSpeech/ASR/CTC/hparams/{train_with_wavlm.yaml => train_hf_wavlm.yaml} (100%)
 create mode 100644 recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
 create mode 100644 tests/recipes/GigaSpeech.csv

diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
new file mode 100644
index 0000000000..9620d7bd06
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
@@ -0,0 +1,386 @@
"""TODO

Authors
 * Adel Moumen 2024
"""
import os
import sys
import torch
import logging
import speechbrain as sb
from speechbrain.utils.distributed import run_on_main, if_main_process
from hyperpyyaml import load_hyperpyyaml
from pathlib import Path

logger = logging.getLogger(__name__)


# Define training procedure
class ASR(sb.Brain):
    def compute_forward(self, batch, stage):
        """Forward computations from the waveform batches to the output probabilities."""
        batch = batch.to(self.device)
        wavs, wav_lens = batch.sig
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)

        # Downsample
the inputs if specified + if hasattr(self.modules, "downsampler"): + wavs = self.modules.downsampler(wavs) + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) + + # Forward pass + + # Handling SpeechBrain vs HuggingFance pretrained models + if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model + latents = self.modules.extractor(wavs) + feats = self.modules.encoder_wrapper(latents, wav_lens=wav_lens)[ + "embeddings" + ] + else: # HuggingFace pretrained model + feats = self.modules.wav2vec2(wavs, wav_lens) + + x = self.modules.enc(feats) + + # Compute outputs + p_tokens = None + logits = self.modules.ctc_lin(x) + + # Upsample the inputs if they have been highly downsampled + if hasattr(self.hparams, "upsampling") and self.hparams.upsampling: + logits = logits.view( + logits.shape[0], -1, self.hparams.output_neurons + ) + + p_ctc = self.hparams.log_softmax(logits) + + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + candidates = [] + scores = [] + + for batch in p_tokens: + candidates.append([hyp.text for hyp in batch]) + scores.append([hyp.score for hyp in batch]) + + if hasattr(self.hparams, "rescorer"): + p_tokens, _ = self.hparams.rescorer.rescore(candidates, scores) + + return p_ctc, wav_lens, p_tokens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + + ids = batch.id + tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + loss = loss_ctc + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = [ + "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") + for utt_seq in predicted_tokens + ] + elif stage == sb.Stage.TEST: + if hasattr(self.hparams, "rescorer"): + predicted_words = [ + hyp[0].split(" ") for hyp in predicted_tokens + ] + else: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" ") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.error_rate_computer() + + if stage == sb.Stage.TEST: + if hasattr(self.hparams, "rescorer"): + self.hparams.rescorer.move_rescorers_to_device() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # Perform end-of-iteration things, like annealing, logging, etc. 
+ if stage == sb.Stage.VALID: + old_lr_model, new_lr_model = self.hparams.lr_annealing_model( + stage_stats["loss"] + ) + old_lr_wav2vec, new_lr_wav2vec = self.hparams.lr_annealing_wav2vec( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.model_optimizer, new_lr_model + ) + sb.nnet.schedulers.update_learning_rate( + self.wav2vec_optimizer, new_lr_wav2vec + ) + self.hparams.train_logger.log_stats( + stats_meta={ + "epoch": epoch, + "lr_model": old_lr_model, + "lr_wav2vec": old_lr_wav2vec, + }, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"]}, min_keys=["WER"], + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.test_wer_file, "w") as w: + self.wer_metric.write_stats(w) + + def init_optimizers(self): + "Initializes the wav2vec2 optimizer and model optimizer" + # Handling SpeechBrain vs HuggingFace pretrained models + if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model + self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( + self.modules.encoder_wrapper.parameters() + ) + + else: # HuggingFace pretrained model + self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( + self.modules.wav2vec2.parameters() + ) + + self.model_optimizer = self.hparams.model_opt_class( + self.hparams.model.parameters() + ) + + # save the optimizers in a dictionary + # the key will be used in `freeze_optimizers()` + self.optimizers_dict = { + "model_optimizer": self.model_optimizer, + } + if not self.hparams.freeze_wav2vec: + self.optimizers_dict["wav2vec_optimizer"] = self.wav2vec_optimizer + + if self.checkpointer is not None: + self.checkpointer.add_recoverable( + "wav2vec_opt", self.wav2vec_optimizer + ) + self.checkpointer.add_recoverable("modelopt", self.model_optimizer) + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + label_encoder = sb.dataio.encoder.CTCTextEncoder() + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = label_encoder.encode_sequence(char_list) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") + special_labels = { + "blank_label": hparams["blank_index"], + } + label_encoder.load_or_create( + path=lab_enc_file, + from_didatasets=[train_data], + output_key="char_list", + special_labels=special_labels, + sequence_input=True, + ) + + # 4. 
Set output:
+    sb.dataio.dataset.set_output_keys(
+        datasets, ["id", "sig", "wrd", "char_list", "tokens"],
+    )
+
+    return train_data, valid_data, test_datasets, label_encoder
+
+
+if __name__ == "__main__":
+
+    # CLI:
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    # create ddp_group with the right communication protocol
+    sb.utils.distributed.ddp_init_group(run_opts)
+
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Dataset prep (parsing Librispeech)
+    from librispeech_prepare import prepare_librispeech  # noqa
+
+    # multi-gpu (ddp) save data preparation
+    run_on_main(
+        prepare_librispeech,
+        kwargs={
+            "data_folder": hparams["data_folder"],
+            "tr_splits": hparams["train_splits"],
+            "dev_splits": hparams["dev_splits"],
+            "te_splits": hparams["test_splits"],
+            "save_folder": hparams["output_folder"],
+            "merge_lst": hparams["train_splits"],
+            "merge_name": "train.csv",
+            "skip_prep": hparams["skip_prep"],
+        },
+    )
+
+    # here we create the datasets objects as well as tokenization and encoding
+    train_data, valid_data, test_datasets, label_encoder = dataio_prepare(
+        hparams
+    )
+
+    # Trainer initialization
+    asr_brain = ASR(
+        modules=hparams["modules"],
+        hparams=hparams,
+        run_opts=run_opts,
+        checkpointer=hparams["checkpointer"],
+    )
+
+    # We load the pretrained wav2vec2 model
+    if "pretrainer" in hparams.keys():
+        run_on_main(hparams["pretrainer"].collect_files)
+        hparams["pretrainer"].load_collected()
+
+    # We dynamically add the tokenizer to our brain class.
+    # NB: This tokenizer corresponds to the one used for the LM!!
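+    # For illustration (hypothetical toy values, not produced by this recipe):
+    # with a character-level CTCTextEncoder and blank_index=0, ind2lab maps
+    # indices to labels, e.g. {0: "<blank>", 1: " ", 2: "a", 3: "b", ...}, so
+    # the vocab_list built just below becomes ["<blank>", " ", "a", "b", ...],
+    # i.e. the labels ordered by index, which is the layout CTCBeamSearcher
+    # expects for its vocabulary.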
+ asr_brain.tokenizer = label_encoder + + ind2lab = label_encoder.ind2lab + vocab_list = [ind2lab[x] for x in range(len(ind2lab))] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], vocab_list=vocab_list, + ) + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/tests/recipes/GigaSpeech.csv b/tests/recipes/GigaSpeech.csv new file mode 100644 index 0000000000..a60e84f864 --- /dev/null +++ b/tests/recipes/GigaSpeech.csv @@ -0,0 +1,2 @@ +Task,Dataset,Script_file,Hparam_file,Data_prep_file,Readme_file,Result_url,HF_repo,test_debug_flags,test_debug_checks,performance +ASR-CTC,GigaSpeech,recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py,recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml,recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True --wav2vec2_folder=tests/tmp/wav2vec2_checkpoint, \ No newline at end of file From c2540853cdd12a23ea0cf46418e37c7ae577b245 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sat, 10 Feb 2024 16:15:42 +0100 Subject: [PATCH 10/77] update recipe to be compliant with gigaspeech csv --- .../ASR/CTC/hparams/train_hf_wavlm.yaml | 1 + .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 46 +++++++++---------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index b172cd559c..497e6198e0 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -28,6 +28,7 @@ ckpt_interval_minutes: 25 # save checkpoint every N min train_csv: !ref /train.csv valid_csv: !ref /dev.csv test_csv: !ref /test.csv +json_file: "GigaSpeech.json" # Training parameters number_of_epochs: 1 diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 9620d7bd06..edf31aa454 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -45,7 +45,6 @@ def compute_forward(self, batch, stage): x = self.modules.enc(feats) # Compute outputs - p_tokens = None logits = self.modules.ctc_lin(x) # Upsample the inputs if they have been highly downsampled @@ -72,6 +71,8 @@ def compute_forward(self, batch, stage): if hasattr(self.hparams, "rescorer"): p_tokens, _ = self.hparams.rescorer.rescore(candidates, scores) + else: + p_tokens = None return p_ctc, wav_lens, p_tokens @@ -98,17 +99,12 @@ def compute_objectives(self, predictions, batch, stage): for utt_seq in predicted_tokens ] elif stage == sb.Stage.TEST: - if hasattr(self.hparams, "rescorer"): - predicted_words = [ - hyp[0].split(" ") for hyp in predicted_tokens - ] - else: - 
predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] + target_words = [wrd.split(" ") for wrd in batch.text] self.wer_metric.append(ids, predicted_words, target_words) self.cer_metric.append(ids, predicted_words, target_words) @@ -250,7 +246,7 @@ def dataio_prepare(hparams): datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.takes("audio_path") @sb.utils.data_pipeline.provides("sig") def audio_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) @@ -260,13 +256,13 @@ def audio_pipeline(wav): label_encoder = sb.dataio.encoder.CTCTextEncoder() # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.takes("text") @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" + "text", "char_list", "tokens_list", "tokens" ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) + def text_pipeline(text): + yield text + char_list = list(text) yield char_list tokens_list = label_encoder.encode_sequence(char_list) yield tokens_list @@ -289,7 +285,7 @@ def text_pipeline(wrd): # 4. Set output: sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], + datasets, ["id", "sig", "text", "char_list", "tokens"], ) return train_data, valid_data, test_datasets, label_encoder @@ -314,19 +310,19 @@ def text_pipeline(wrd): ) # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa + from gigaspeech import prepare_gigaspeech # noqa # multi-gpu (ddp) save data preparation run_on_main( - prepare_librispeech, + prepare_gigaspeech, kwargs={ "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", + "save_folder": hparams["save_folder"], + "splits": hparams["splits"], + "output_train_csv_filename": hparams["train_csv"], + "output_dev_csv_filename": hparams["valid_csv"], + "output_test_csv_filename": hparams["test_csv"], + "json_file": hparams["json_file"], "skip_prep": hparams["skip_prep"], }, ) From b4de83a9a4cdb69f230b1525cf6d0f9957c7ae7e Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sat, 10 Feb 2024 21:49:03 +0100 Subject: [PATCH 11/77] add transformers dep --- recipes/GigaSpeech/ASR/CTC/extra_requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 recipes/GigaSpeech/ASR/CTC/extra_requirements.txt diff --git a/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt new file mode 100644 index 0000000000..976a2b1f39 --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt @@ -0,0 +1 @@ +transformers From c3afdccb16398334b262c86a5e408546c2dd5bc8 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sat, 10 Feb 2024 21:50:43 +0100 Subject: [PATCH 12/77] convert opus to wav --- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 86 +++++++++++++++---- recipes/GigaSpeech/gigaspeech_prepare.py | 68 ++++++++++++--- 2 files changed, 126 insertions(+), 28 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index d13fda4bb2..5be5091b03 100644 
--- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -68,8 +68,12 @@ def prepare_gigaspeech( data_folder: str, save_folder: str, splits: list, + output_train_csv_filename=None, + output_dev_csv_filename=None, + output_test_csv_filename=None, json_file: str = "GigaSpeech.json", skip_prep: bool = False, + convert_opus_to_wav: bool = True, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -90,10 +94,18 @@ def prepare_gigaspeech( The path to the folder where the CSV files will be saved. splits : list The list of splits to be used for creating the CSV files. + output_train_csv_filename : str, optional + The name of the CSV file which will be containing the train subset. + output_dev_csv_filename : str, optional + The name of the CSV file which will be containing the dev subset. + output_test_csv_filename : str, optional + The name of the CSV file which will be containing the test subset. json_file : str, optional The name of the JSON file containing the metadata of the GigaSpeech dataset. skip_prep : bool, optional If True, the data preparation will be skipped, and the function will return immediately. + convert_opus_to_wav : bool, optional + If True, the opus files will be converted to wav files. Returns ------- @@ -121,11 +133,12 @@ def prepare_gigaspeech( save_csv_files = {} for split in splits: if split in TRAIN_SUBSET: - save_csv_files[split] = os.path.join(save_folder, "train.csv") + save_csv_files[split] = output_train_csv_filename else: - save_csv_files[split] = os.path.join( - save_folder, f"{split.lower()}.csv" - ) + if split == "DEV": + save_csv_files[split] = output_dev_csv_filename + elif split == "TEST": + save_csv_files[split] = output_test_csv_filename # check if the data is already prepared if skip(save_csv_files): @@ -135,21 +148,23 @@ def prepare_gigaspeech( logger.info("Starting data preparation...") check_gigaspeech_folders(data_folder, json_file) - - json_metadata = os.path.join(data_folder, json_file) logger.info(f"Starting reading {json_file}.") - with open(json_metadata, "r") as f: + with open(json_file, "r") as f: info = json.load(f) logger.info(f"Reading {json_file} done.") logger.info("Creating train, dev, and test subsets.") for split, output_csv_file in save_csv_files.items(): logger.info(f"Starting creating {output_csv_file} using {split} split.") - create_csv(output_csv_file, info, data_folder, split) + create_csv( + output_csv_file, info, data_folder, split, convert_opus_to_wav + ) logger.info("Data preparation completed!") -def process_line(audio: json, data_folder: str, split: str) -> list: +def process_line( + audio: json, data_folder: str, split: str, convert_opus_to_wav: bool +) -> list: """ Process the audio line and return the utterances for the given split. @@ -161,6 +176,8 @@ def process_line(audio: json, data_folder: str, split: str) -> list: The path to the GigaSpeech dataset. split : str The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. Returns ------- @@ -172,6 +189,9 @@ def process_line(audio: json, data_folder: str, split: str) -> list: audio_path = os.path.join(data_folder, audio["path"]) assert os.path.isfile(audio_path), f"File not found: {audio_path}" + if convert_opus_to_wav and audio_path.endswith(".opus"): + audio_path = convert_opus2wav(audio_path) + # 2. 
iterate over the utterances utterances = [] for segment in audio["segments"]: @@ -194,7 +214,13 @@ def process_line(audio: json, data_folder: str, split: str) -> list: return utterances -def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: +def create_csv( + csv_file: str, + info: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, +) -> None: """ Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. @@ -208,6 +234,8 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: The path to the GigaSpeech dataset. split : str The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. Returns ------- @@ -217,7 +245,10 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: nb_samples = 0 line_processor = functools.partial( - process_line, data_folder=data_folder, split=split, + process_line, + data_folder=data_folder, + split=split, + convert_opus_to_wav=convert_opus_to_wav, ) csv_file_tmp = csv_file + ".tmp" @@ -226,7 +257,7 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL ) header = [ - "utt_id", + "ID", "audio_id", "audio_path", "speaker", @@ -266,6 +297,31 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: ) +def convert_opus2wav(audio_opus_path): + """Convert an opus file to a wav file. + + Parameters + ---------- + audio_opus_path : str + The path to the opus file to be converted. + + Returns + ------- + str + The path to the converted wav file. + + Raises + ------ + subprocess.CalledProcessError + If the conversion process fails. + """ + audio_wav_path = audio_opus_path.replace(".opus", ".wav") + os.system( + f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar 16000 {audio_wav_path} > /dev/null 2>&1" + ) + return audio_wav_path + + def preprocess_text(text: str) -> str: """ Preprocesses the input text by removing garbage tags and replacing punctuation tags. @@ -356,12 +412,10 @@ def check_gigaspeech_folders( If GigaSpeech is not found at the specified path. """ # Checking if "GigaSpeech.json" exist - json_gigaspeech = os.path.join(data_folder, json_file) - - if not os.path.exists(json_gigaspeech): + if not os.path.exists(json_file): err_msg = ( "the opus file %s does not exist (it is expected in the " - "Gigaspeech dataset)" % json_gigaspeech + "Gigaspeech dataset)" % json_file ) raise OSError(err_msg) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index a1728e59d1..5be5091b03 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -73,6 +73,7 @@ def prepare_gigaspeech( output_test_csv_filename=None, json_file: str = "GigaSpeech.json", skip_prep: bool = False, + convert_opus_to_wav: bool = True, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -103,6 +104,8 @@ def prepare_gigaspeech( The name of the JSON file containing the metadata of the GigaSpeech dataset. skip_prep : bool, optional If True, the data preparation will be skipped, and the function will return immediately. + convert_opus_to_wav : bool, optional + If True, the opus files will be converted to wav files. 
Returns ------- @@ -145,21 +148,23 @@ def prepare_gigaspeech( logger.info("Starting data preparation...") check_gigaspeech_folders(data_folder, json_file) - - json_metadata = os.path.join(data_folder, json_file) logger.info(f"Starting reading {json_file}.") - with open(json_metadata, "r") as f: + with open(json_file, "r") as f: info = json.load(f) logger.info(f"Reading {json_file} done.") logger.info("Creating train, dev, and test subsets.") for split, output_csv_file in save_csv_files.items(): logger.info(f"Starting creating {output_csv_file} using {split} split.") - create_csv(output_csv_file, info, data_folder, split) + create_csv( + output_csv_file, info, data_folder, split, convert_opus_to_wav + ) logger.info("Data preparation completed!") -def process_line(audio: json, data_folder: str, split: str) -> list: +def process_line( + audio: json, data_folder: str, split: str, convert_opus_to_wav: bool +) -> list: """ Process the audio line and return the utterances for the given split. @@ -171,6 +176,8 @@ def process_line(audio: json, data_folder: str, split: str) -> list: The path to the GigaSpeech dataset. split : str The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. Returns ------- @@ -182,6 +189,9 @@ def process_line(audio: json, data_folder: str, split: str) -> list: audio_path = os.path.join(data_folder, audio["path"]) assert os.path.isfile(audio_path), f"File not found: {audio_path}" + if convert_opus_to_wav and audio_path.endswith(".opus"): + audio_path = convert_opus2wav(audio_path) + # 2. iterate over the utterances utterances = [] for segment in audio["segments"]: @@ -204,7 +214,13 @@ def process_line(audio: json, data_folder: str, split: str) -> list: return utterances -def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: +def create_csv( + csv_file: str, + info: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, +) -> None: """ Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. @@ -218,6 +234,8 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: The path to the GigaSpeech dataset. split : str The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. Returns ------- @@ -227,7 +245,10 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: nb_samples = 0 line_processor = functools.partial( - process_line, data_folder=data_folder, split=split, + process_line, + data_folder=data_folder, + split=split, + convert_opus_to_wav=convert_opus_to_wav, ) csv_file_tmp = csv_file + ".tmp" @@ -236,7 +257,7 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL ) header = [ - "utt_id", + "ID", "audio_id", "audio_path", "speaker", @@ -276,6 +297,31 @@ def create_csv(csv_file: str, info: json, data_folder: str, split: str) -> None: ) +def convert_opus2wav(audio_opus_path): + """Convert an opus file to a wav file. + + Parameters + ---------- + audio_opus_path : str + The path to the opus file to be converted. + + Returns + ------- + str + The path to the converted wav file. + + Raises + ------ + subprocess.CalledProcessError + If the conversion process fails. 
+ """ + audio_wav_path = audio_opus_path.replace(".opus", ".wav") + os.system( + f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar 16000 {audio_wav_path} > /dev/null 2>&1" + ) + return audio_wav_path + + def preprocess_text(text: str) -> str: """ Preprocesses the input text by removing garbage tags and replacing punctuation tags. @@ -366,12 +412,10 @@ def check_gigaspeech_folders( If GigaSpeech is not found at the specified path. """ # Checking if "GigaSpeech.json" exist - json_gigaspeech = os.path.join(data_folder, json_file) - - if not os.path.exists(json_gigaspeech): + if not os.path.exists(json_file): err_msg = ( "the opus file %s does not exist (it is expected in the " - "Gigaspeech dataset)" % json_gigaspeech + "Gigaspeech dataset)" % json_file ) raise OSError(err_msg) From 945b8bb85d6dbd2e68bd4e5b7095836a22c487eb Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sat, 10 Feb 2024 21:54:06 +0100 Subject: [PATCH 13/77] recipe --debug mode works. --- .../ASR/CTC/hparams/train_hf_wavlm.yaml | 17 ++-- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 79 +++++++++---------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index 497e6198e0..c1ff091b5f 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -24,17 +24,19 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # and ["DEV", "TEST"] for the eval splits. splits: ["XS", "DEV", "TEST"] skip_prep: False +convert_opus_to_wav: True ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train.csv -valid_csv: !ref /dev.csv -test_csv: !ref /test.csv -json_file: "GigaSpeech.json" +train_csv: !ref /train.csv +valid_csv: !ref /dev.csv +test_csv: !ref /test.csv +json_file: !ref /GigaSpeech.json # Training parameters number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 sorting: ascending +num_workers: 4 precision: fp32 # bf16, fp16 or fp32 sample_rate: 16000 @@ -42,14 +44,15 @@ sample_rate: 16000 # With DDP batch_size is multiplied by N jobs # Must be 3 per GPU to fit 32GB of VRAM batch_size: 6 -test_batch_size: 8 +test_batch_size: 1 # Dataloader options train_dataloader_opts: batch_size: !ref + num_workers: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref test_dataloader_opts: batch_size: !ref @@ -61,7 +64,7 @@ dnn_neurons: 1024 freeze_wav2vec: True # Outputs -output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 +output_neurons: 34 blank_index: 0 # Decoding parameters diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index edf31aa454..40b9236d7d 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -10,7 +10,6 @@ import speechbrain as sb from speechbrain.utils.distributed import run_on_main, if_main_process from hyperpyyaml import load_hyperpyyaml -from pathlib import Path logger = logging.getLogger(__name__) @@ -61,16 +60,6 @@ def compute_forward(self, batch, stage): ) elif stage == sb.Stage.TEST: p_tokens = test_searcher(p_ctc, wav_lens) - - candidates = [] - scores = [] - - for batch in p_tokens: - candidates.append([hyp.text for hyp in batch]) - scores.append([hyp.score for hyp in batch]) - - if hasattr(self.hparams, "rescorer"): - p_tokens, _ = self.hparams.rescorer.rescore(candidates, scores) else: p_tokens = None @@ -232,24 +221,24 @@ def dataio_prepare(hparams): ) valid_data = 
valid_data.filtered_sorted(sort_key="duration") - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) + test_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["test_csv"], replacements={"data_root": data_folder}, + ) - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + # We also sort the validation data so it is faster to validate + test_data = test_data.filtered_sorted(sort_key="duration") + + datasets = [train_data, valid_data, test_data] # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("audio_path") + @sb.utils.data_pipeline.takes("audio_path", "begin_time", "end_time") @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) + def audio_pipeline(audio_path, begin_time, end_time): + start_sample = int(float(begin_time) * hparams["sample_rate"]) + stop_sample = int(float(end_time) * hparams["sample_rate"]) + sig = sb.dataio.dataio.read_audio( + {"file": audio_path, "start": start_sample, "stop": stop_sample} + ) return sig sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -288,7 +277,7 @@ def text_pipeline(text): datasets, ["id", "sig", "text", "char_list", "tokens"], ) - return train_data, valid_data, test_datasets, label_encoder + return train_data, valid_data, test_data, label_encoder if __name__ == "__main__": @@ -310,7 +299,7 @@ def text_pipeline(text): ) # Dataset prep (parsing Librispeech) - from gigaspeech import prepare_gigaspeech # noqa + from gigaspeech_prepare import prepare_gigaspeech # noqa # multi-gpu (ddp) save data preparation run_on_main( @@ -324,13 +313,12 @@ def text_pipeline(text): "output_test_csv_filename": hparams["test_csv"], "json_file": hparams["json_file"], "skip_prep": hparams["skip_prep"], + "convert_opus_to_wav": hparams["convert_opus_to_wav"], }, ) # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) + train_data, valid_data, test_data, label_encoder = dataio_prepare(hparams) # Trainer initialization asr_brain = ASR( @@ -368,15 +356,24 @@ def text_pipeline(text): ) # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) + os.makedirs(hparams["output_wer_folder"], exist_ok=True) - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) + # report WER on valid data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], f"valid_wer.txt" + ) + asr_brain.evaluate( + valid_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) + + # report WER on test data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], f"test_wer.txt" + ) + asr_brain.evaluate( + test_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) From ae9120910335ed3b6f2eb97aad7fe9bf14769099 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sat, 10 Feb 2024 22:20:31 +0100 Subject: [PATCH 14/77] typo 
GRABAGE_UTTERANCE_TAGS -> GARBAGE_UTTERANCE_TAGS --- recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 6 +++--- recipes/GigaSpeech/gigaspeech_prepare.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 5be5091b03..b285c2bbbc 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -GRABAGE_UTTERANCE_TAGS = ["", "", "", ""] +GARBAGE_UTTERANCE_TAGS = ["", "", "", ""] PUNCTUATION_TAGS = { "": ",", "": "!", @@ -343,7 +343,7 @@ def preprocess_text(text: str) -> str: Notes ----- - The function iterates over predefined garbage utterance tags (GRABAGE_UTTERANCE_TAGS) + The function iterates over predefined garbage utterance tags (GARBAGE_UTTERANCE_TAGS) and removes them from the input text. It then iterates over predefined punctuation tags (PUNCTUATION_TAGS) and replaces them with the corresponding punctuation. @@ -354,7 +354,7 @@ def preprocess_text(text: str) -> str: "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you." """ # Remove garbage tags - for tag in GRABAGE_UTTERANCE_TAGS: + for tag in GARBAGE_UTTERANCE_TAGS: if tag in text: return "" diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 5be5091b03..b285c2bbbc 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -GRABAGE_UTTERANCE_TAGS = ["", "", "", ""] +GARBAGE_UTTERANCE_TAGS = ["", "", "", ""] PUNCTUATION_TAGS = { "": ",", "": "!", @@ -343,7 +343,7 @@ def preprocess_text(text: str) -> str: Notes ----- - The function iterates over predefined garbage utterance tags (GRABAGE_UTTERANCE_TAGS) + The function iterates over predefined garbage utterance tags (GARBAGE_UTTERANCE_TAGS) and removes them from the input text. It then iterates over predefined punctuation tags (PUNCTUATION_TAGS) and replaces them with the corresponding punctuation. @@ -354,7 +354,7 @@ def preprocess_text(text: str) -> str: "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you." """ # Remove garbage tags - for tag in GRABAGE_UTTERANCE_TAGS: + for tag in GARBAGE_UTTERANCE_TAGS: if tag in text: return "" From 28b4257e5c25a2e43724d3e25fd555b6e8c7c6a6 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sun, 11 Feb 2024 10:36:13 +0100 Subject: [PATCH 15/77] tmp DL file --- recipes/GigaSpeech/download_gigaspeech.py | 100 ++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 recipes/GigaSpeech/download_gigaspeech.py diff --git a/recipes/GigaSpeech/download_gigaspeech.py b/recipes/GigaSpeech/download_gigaspeech.py new file mode 100644 index 0000000000..5f154c6507 --- /dev/null +++ b/recipes/GigaSpeech/download_gigaspeech.py @@ -0,0 +1,100 @@ +""" +Note for reviewer: this is a temporary script. It may be removed in the future. + +Download script for GigaSpeech dataset. 
+ +Download instructions: https://github.com/SpeechColab/GigaSpeech +Reference: https://arxiv.org/abs/2106.06909 + +Author +------- + * Adel Moumen, 2024 +""" + +import logging +from typing import Optional, Sequence, Union +import argparse + +logger = logging.getLogger(__name__) + + +def download_gigaspeech( + password: str, + target_dir: str = ".", + dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", + host: Optional[str] = "tsinghua", +) -> str: + """Download GigaSpeech dataset. + + Parameters + ---------- + password : str + The password to access the GigaSpeech dataset. + target_dir : str, optional + The path to the directory where the dataset will be downloaded. + dataset_parts : Union[str, Sequence[str]], optional + The parts of the dataset to be downloaded. + If "auto", all parts will be downloaded. + If a string, it should be a comma-separated list of parts to be downloaded. + If a list, it should be a list of parts to be downloaded. + host : str, optional + The host to be used for downloading the dataset. + The available hosts are "tsinghua" and "jhu". + + Returns + ------- + str + The path to the directory where the dataset is downloaded. + """ + try: + from speechcolab.datasets.gigaspeech import GigaSpeech + except ImportError: + raise ImportError( + "Please install the speechcolab package to download the GigaSpeech dataset." + ) + gigaspeech = GigaSpeech(target_dir) + + if dataset_parts == "auto": + dataset_parts = ("XL", "DEV", "TEST") + elif isinstance(dataset_parts, str): + dataset_parts = [dataset_parts] + + for part in dataset_parts: + logging.info(f"Downloading GigaSpeech part: {part}") + gigaspeech.download(password, "{" + part + "}", host=host) + + return target_dir + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download GigaSpeech dataset.") + parser.add_argument( + "--password", + type=str, + required=True, + help="The password to access the GigaSpeech dataset.", + ) + parser.add_argument( + "--target_dir", + type=str, + default=".", + help="The path to the directory where the dataset will be downloaded.", + ) + parser.add_argument( + "--dataset_parts", + type=str, + nargs="+", # '+' means one or more values will be collected into a list + default=["auto"], + help="The parts of the dataset to be downloaded.", + ) + parser.add_argument( + "--host", + type=str, + default="tsinghua", + help="The host to be used for downloading the dataset.", + ) + args = parser.parse_args() + + download_gigaspeech( + args.password, args.target_dir, args.dataset_parts, args.host + ) From 3a6396c21d0c1dc922d32b4cbe8175912ad692e2 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sun, 11 Feb 2024 11:57:12 +0100 Subject: [PATCH 16/77] update DL FILE --- recipes/GigaSpeech/download_gigaspeech.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/recipes/GigaSpeech/download_gigaspeech.py b/recipes/GigaSpeech/download_gigaspeech.py index 5f154c6507..c1e3f919d7 100644 --- a/recipes/GigaSpeech/download_gigaspeech.py +++ b/recipes/GigaSpeech/download_gigaspeech.py @@ -23,7 +23,7 @@ def download_gigaspeech( target_dir: str = ".", dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", host: Optional[str] = "tsinghua", -) -> str: +) -> None: """Download GigaSpeech dataset. Parameters @@ -39,12 +39,7 @@ def download_gigaspeech( If a list, it should be a list of parts to be downloaded. host : str, optional The host to be used for downloading the dataset. - The available hosts are "tsinghua" and "jhu". 
- - Returns - ------- - str - The path to the directory where the dataset is downloaded. + The available hosts are described in https://github.com/SpeechColab/GigaSpeech. """ try: from speechcolab.datasets.gigaspeech import GigaSpeech @@ -54,16 +49,14 @@ def download_gigaspeech( ) gigaspeech = GigaSpeech(target_dir) - if dataset_parts == "auto": - dataset_parts = ("XL", "DEV", "TEST") - elif isinstance(dataset_parts, str): - dataset_parts = [dataset_parts] + if dataset_parts == ["auto"]: + dataset_parts = ["XL", "DEV", "TEST"] for part in dataset_parts: logging.info(f"Downloading GigaSpeech part: {part}") gigaspeech.download(password, "{" + part + "}", host=host) - return target_dir + logger.info(f"GigaSpeech dataset finished downloading to {target_dir}.") if __name__ == "__main__": From 6e771d78a1d01237d99805099d0154ac251c8e48 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sun, 11 Feb 2024 11:57:32 +0100 Subject: [PATCH 17/77] add DL file in ASR/CTC --- .../GigaSpeech/ASR/CTC/download_gigaspeech.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py diff --git a/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py b/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py new file mode 100644 index 0000000000..c1e3f919d7 --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py @@ -0,0 +1,93 @@ +""" +Note for reviewer: this is a temporary script. It may be removed in the future. + +Download script for GigaSpeech dataset. + +Download instructions: https://github.com/SpeechColab/GigaSpeech +Reference: https://arxiv.org/abs/2106.06909 + +Author +------- + * Adel Moumen, 2024 +""" + +import logging +from typing import Optional, Sequence, Union +import argparse + +logger = logging.getLogger(__name__) + + +def download_gigaspeech( + password: str, + target_dir: str = ".", + dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", + host: Optional[str] = "tsinghua", +) -> None: + """Download GigaSpeech dataset. + + Parameters + ---------- + password : str + The password to access the GigaSpeech dataset. + target_dir : str, optional + The path to the directory where the dataset will be downloaded. + dataset_parts : Union[str, Sequence[str]], optional + The parts of the dataset to be downloaded. + If "auto", all parts will be downloaded. + If a string, it should be a comma-separated list of parts to be downloaded. + If a list, it should be a list of parts to be downloaded. + host : str, optional + The host to be used for downloading the dataset. + The available hosts are described in https://github.com/SpeechColab/GigaSpeech. + """ + try: + from speechcolab.datasets.gigaspeech import GigaSpeech + except ImportError: + raise ImportError( + "Please install the speechcolab package to download the GigaSpeech dataset." 
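+            # e.g. via `pip install speechcolab` (the package is also listed
+            # in this recipe's extra_requirements.txt)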
+ ) + gigaspeech = GigaSpeech(target_dir) + + if dataset_parts == ["auto"]: + dataset_parts = ["XL", "DEV", "TEST"] + + for part in dataset_parts: + logging.info(f"Downloading GigaSpeech part: {part}") + gigaspeech.download(password, "{" + part + "}", host=host) + + logger.info(f"GigaSpeech dataset finished downloading to {target_dir}.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download GigaSpeech dataset.") + parser.add_argument( + "--password", + type=str, + required=True, + help="The password to access the GigaSpeech dataset.", + ) + parser.add_argument( + "--target_dir", + type=str, + default=".", + help="The path to the directory where the dataset will be downloaded.", + ) + parser.add_argument( + "--dataset_parts", + type=str, + nargs="+", # '+' means one or more values will be collected into a list + default=["auto"], + help="The parts of the dataset to be downloaded.", + ) + parser.add_argument( + "--host", + type=str, + default="tsinghua", + help="The host to be used for downloading the dataset.", + ) + args = parser.parse_args() + + download_gigaspeech( + args.password, args.target_dir, args.dataset_parts, args.host + ) From ebfcddb5401c6747b886c00be0e7d5f1a73cc7e3 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Sun, 11 Feb 2024 12:00:15 +0100 Subject: [PATCH 18/77] update extra_requirements.txt --- recipes/GigaSpeech/ASR/CTC/extra_requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt index 976a2b1f39..afad715b9f 100644 --- a/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt +++ b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt @@ -1 +1,3 @@ +kenlm +speechcolab transformers From a68d0b8a9b590bbb167ebc6e86ef3e1693a2527c Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 12 Feb 2024 10:53:14 +0100 Subject: [PATCH 19/77] add support of savedir within Pretrained subclasses --- speechbrain/inference/ASR.py | 4 +++- speechbrain/inference/interfaces.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/speechbrain/inference/ASR.py b/speechbrain/inference/ASR.py index b1b0d4d105..661fe58374 100644 --- a/speechbrain/inference/ASR.py +++ b/speechbrain/inference/ASR.py @@ -235,7 +235,9 @@ def set_decoding_function(self): opt_beam_search_params["kenlm_model_path"] ) kenlm_model_path = str( - fetch(fl, source=source, savedir=".") + fetch( + fl, source=source, savedir=self.hparams.savedir + ) ) # we need to update the kenlm_model_path in the opt_beam_search_params opt_beam_search_params[ diff --git a/speechbrain/inference/interfaces.py b/speechbrain/inference/interfaces.py index c84d9aad15..2916985082 100644 --- a/speechbrain/inference/interfaces.py +++ b/speechbrain/inference/interfaces.py @@ -126,6 +126,7 @@ def foreign_class( with open(hparams_local_path) as fin: hparams = load_hyperpyyaml(fin, overrides, overrides_must_match) + hparams["savedir"] = savedir # Pretraining: pretrainer = hparams["pretrainer"] pretrainer.set_collect_in(savedir) @@ -447,6 +448,7 @@ def from_hparams( if savedir is None: clsname = cls.__name__ savedir = f"./pretrained_models/{clsname}-{hashlib.md5(source.encode('UTF-8', errors='replace')).hexdigest()}" + hparams_local_path = fetch( filename=hparams_file, source=source, @@ -483,6 +485,9 @@ def from_hparams( with open(hparams_local_path) as fin: hparams = load_hyperpyyaml(fin, overrides) + # add savedir to hparams + hparams["savedir"] = savedir + # Pretraining: pretrainer = 
hparams["pretrainer"] pretrainer.set_collect_in(savedir) From b2ed2a94dca525167d6d5d9344e1c38a1b91d32c Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 12 Feb 2024 21:42:53 +0100 Subject: [PATCH 20/77] add wbs requirements --- recipes/GigaSpeech/extra_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/GigaSpeech/extra_requirements.txt b/recipes/GigaSpeech/extra_requirements.txt index 7a239a9e76..91de2461ce 100644 --- a/recipes/GigaSpeech/extra_requirements.txt +++ b/recipes/GigaSpeech/extra_requirements.txt @@ -1 +1,2 @@ speechcolab +webdataset From 4b8c53322700147a12f8520a071c49938881e953 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Tue, 13 Feb 2024 18:33:25 +0100 Subject: [PATCH 21/77] webdataset --- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 233 ++++++++++++++++-- recipes/GigaSpeech/gigaspeech_prepare.py | 233 ++++++++++++++++-- 2 files changed, 420 insertions(+), 46 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index b285c2bbbc..0d2671fb09 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map +import speechbrain as sb logger = logging.getLogger(__name__) @@ -28,6 +29,7 @@ } SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] +SAMPLING_RATE = 16000 @dataclass @@ -68,12 +70,16 @@ def prepare_gigaspeech( data_folder: str, save_folder: str, splits: list, - output_train_csv_filename=None, - output_dev_csv_filename=None, - output_test_csv_filename=None, + output_train: str, + output_dev: str, + output_test: str, json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, + use_webdataset: bool = True, + verbose: int = 0, + samples_per_shard=500, + max_size_shard=1e9, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -94,23 +100,34 @@ def prepare_gigaspeech( The path to the folder where the CSV files will be saved. splits : list The list of splits to be used for creating the CSV files. - output_train_csv_filename : str, optional - The name of the CSV file which will be containing the train subset. - output_dev_csv_filename : str, optional - The name of the CSV file which will be containing the dev subset. - output_test_csv_filename : str, optional - The name of the CSV file which will be containing the test subset. + output_train : str + The path in which the train CSV or shards will be saved. + output_dev : str + The path in which the dev CSV or shards will be saved. + output_test : str + The path in which the test CSV or shards will be saved. json_file : str, optional The name of the JSON file containing the metadata of the GigaSpeech dataset. skip_prep : bool, optional If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. + use_webdataset : bool, optional + If True, the data will be saved in the webdataset format. + verbose : int, optional + The verbosity level for the webdataset. + samples_per_shard: int + The number of samples per shard. + max_size_shard : int + The maximum size of the shard. 
Returns ------- None """ + logger.info(f"Preparing GigaSpeech dataset in {save_folder}...") + print(f"Input args: {locals()}") + if skip_prep: logger.info("Skipping data preparation as `skip_prep` is set to `True`") return @@ -129,36 +146,54 @@ def prepare_gigaspeech( os.makedirs(save_folder, exist_ok=True) - # Setting output files - save_csv_files = {} + # Setting output paths + save_output = {} for split in splits: if split in TRAIN_SUBSET: - save_csv_files[split] = output_train_csv_filename + save_output[split] = output_train + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) else: if split == "DEV": - save_csv_files[split] = output_dev_csv_filename + save_output[split] = output_dev elif split == "TEST": - save_csv_files[split] = output_test_csv_filename + save_output[split] = output_test + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) # check if the data is already prepared - if skip(save_csv_files): + if use_webdataset and skip_webdataset(save_output): + logger.info("Skipping preparation, completed in previous run.") + return + elif skip_csv(save_output): logger.info("Skipping preparation, completed in previous run.") return else: logger.info("Starting data preparation...") + # check that the data folder contains the GigaSpeech dataset check_gigaspeech_folders(data_folder, json_file) + logger.info(f"Starting reading {json_file}.") with open(json_file, "r") as f: info = json.load(f) logger.info(f"Reading {json_file} done.") - logger.info("Creating train, dev, and test subsets.") - for split, output_csv_file in save_csv_files.items(): - logger.info(f"Starting creating {output_csv_file} using {split} split.") - create_csv( - output_csv_file, info, data_folder, split, convert_opus_to_wav - ) + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + if use_webdataset: + create_shards( + output, + info, + data_folder, + split, + convert_opus_to_wav, + verbose=verbose, + samples_per_shard=samples_per_shard, + max_size_shard=max_size_shard, + ) + else: + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") @@ -214,6 +249,142 @@ def process_line( return utterances +def _write_in_sink( + items, max_size_shard, samples_per_shard, save_folder, verbose=0 +): + """ + Write the GigaSpeechRow in the webdataset sinks. + + Parameters + ---------- + items : list + The list of items to be written in the sink. + max_size_shard : int + The maximum size of the shard. + samples_per_shard: int + The number of samples per shard. + save_folder : str + The path to the folder where the shards will be saved. + verbose : int, optional + The verbosity level for the webdataset. + + Returns + ------- + list + The list of items written in the sink. 
+ """ + import webdataset as wds + + id_proc, row = items + pattern = os.path.join(save_folder, f"GigaSpeech-{id_proc}-%06d.tar") + with wds.ShardWriter( + pattern, + maxsize=max_size_shard, + maxcount=samples_per_shard, + verbose=verbose, + ) as sink: + for item in row: + start_sample = int(item.begin_time * SAMPLING_RATE) + stop_sample = int(item.end_time * SAMPLING_RATE) + audio = sb.dataio.dataio.read_audio( + { + "file": item.audio_path, + "start": start_sample, + "stop": stop_sample, + } + ) + + sample = { + "__key__": item.utt_id, + "audio.pth": audio, + "text": item.text, + } + + # write back to sink + sink.write(sample) + return row + + +def create_shards( + shards_folder_path: str, + info: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, + verbose: int = 0, + samples_per_shard=500, + max_size_shard=1e9, +) -> None: + """ + Create shards for the GigaSpeech dataset. + + Parameters + ---------- + shards_folder_path : str + The path to the shards folder. + info : dict + The GigaSpeech JSON file content. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. + samples_per_shard: int + The number of samples per shard. + max_size_shard : int + The maximum size of the shard. + + Returns + ------- + None + """ + total_duration = 0.0 + nb_samples = 0 + + line_processor = functools.partial( + process_line, + data_folder=data_folder, + split=split, + convert_opus_to_wav=convert_opus_to_wav, + ) + + os.makedirs(shards_folder_path, exist_ok=True) + + audios_info = [(0, [])] + id_proc = 1 + for row in parallel_map(line_processor, info["audios"]): + if row is None: + continue + + # creates buckets of samples_per_shard size for each shard + for item in row: + audios_info[-1][1].append(item) + if len(audios_info[-1][1]) == samples_per_shard: + audios_info.append((id_proc, [])) + id_proc += 1 + + sink_processor = functools.partial( + _write_in_sink, + save_folder=shards_folder_path, + max_size_shard=max_size_shard, + samples_per_shard=samples_per_shard, + verbose=verbose, + ) + + logger.info(f"Starting writing shards in {shards_folder_path}...") + for row in parallel_map(sink_processor, audios_info, chunk_size=1): + for item in row: + total_duration += item.duration + nb_samples += 1 + + logger.info(f"{split} shards succesfully created at {shards_folder_path}!") + logger.info(f"Number of samples in {split} split: {nb_samples}") + logger.info( + f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" + ) + + def create_csv( csv_file: str, info: json, @@ -317,7 +488,7 @@ def convert_opus2wav(audio_opus_path): """ audio_wav_path = audio_opus_path.replace(".opus", ".wav") os.system( - f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar 16000 {audio_wav_path} > /dev/null 2>&1" + f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar {SAMPLING_RATE} {audio_wav_path} > /dev/null 2>&1" ) return audio_wav_path @@ -368,7 +539,23 @@ def preprocess_text(text: str) -> str: return text.lower() -def skip(save_csv_files: dict) -> bool: +def skip_webdataset(save_folders: str) -> bool: + """ Check if the webdataset shards already exist. + + Parameters + ---------- + save_folders : str + The path to the folder where the shards will be saved. + + Returns + ------- + bool + True if the webdataset shards already exist, False otherwise. 
+ """ + return all(os.listdir(folder) for folder in save_folders.values()) + + +def skip_csv(save_csv_files: dict) -> bool: """ Check if the CSV files already exist. Parameters diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index b285c2bbbc..0d2671fb09 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map +import speechbrain as sb logger = logging.getLogger(__name__) @@ -28,6 +29,7 @@ } SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] +SAMPLING_RATE = 16000 @dataclass @@ -68,12 +70,16 @@ def prepare_gigaspeech( data_folder: str, save_folder: str, splits: list, - output_train_csv_filename=None, - output_dev_csv_filename=None, - output_test_csv_filename=None, + output_train: str, + output_dev: str, + output_test: str, json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, + use_webdataset: bool = True, + verbose: int = 0, + samples_per_shard=500, + max_size_shard=1e9, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -94,23 +100,34 @@ def prepare_gigaspeech( The path to the folder where the CSV files will be saved. splits : list The list of splits to be used for creating the CSV files. - output_train_csv_filename : str, optional - The name of the CSV file which will be containing the train subset. - output_dev_csv_filename : str, optional - The name of the CSV file which will be containing the dev subset. - output_test_csv_filename : str, optional - The name of the CSV file which will be containing the test subset. + output_train : str + The path in which the train CSV or shards will be saved. + output_dev : str + The path in which the dev CSV or shards will be saved. + output_test : str + The path in which the test CSV or shards will be saved. json_file : str, optional The name of the JSON file containing the metadata of the GigaSpeech dataset. skip_prep : bool, optional If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. + use_webdataset : bool, optional + If True, the data will be saved in the webdataset format. + verbose : int, optional + The verbosity level for the webdataset. + samples_per_shard: int + The number of samples per shard. + max_size_shard : int + The maximum size of the shard. 
Returns ------- None """ + logger.info(f"Preparing GigaSpeech dataset in {save_folder}...") + print(f"Input args: {locals()}") + if skip_prep: logger.info("Skipping data preparation as `skip_prep` is set to `True`") return @@ -129,36 +146,54 @@ def prepare_gigaspeech( os.makedirs(save_folder, exist_ok=True) - # Setting output files - save_csv_files = {} + # Setting output paths + save_output = {} for split in splits: if split in TRAIN_SUBSET: - save_csv_files[split] = output_train_csv_filename + save_output[split] = output_train + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) else: if split == "DEV": - save_csv_files[split] = output_dev_csv_filename + save_output[split] = output_dev elif split == "TEST": - save_csv_files[split] = output_test_csv_filename + save_output[split] = output_test + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) # check if the data is already prepared - if skip(save_csv_files): + if use_webdataset and skip_webdataset(save_output): + logger.info("Skipping preparation, completed in previous run.") + return + elif skip_csv(save_output): logger.info("Skipping preparation, completed in previous run.") return else: logger.info("Starting data preparation...") + # check that the data folder contains the GigaSpeech dataset check_gigaspeech_folders(data_folder, json_file) + logger.info(f"Starting reading {json_file}.") with open(json_file, "r") as f: info = json.load(f) logger.info(f"Reading {json_file} done.") - logger.info("Creating train, dev, and test subsets.") - for split, output_csv_file in save_csv_files.items(): - logger.info(f"Starting creating {output_csv_file} using {split} split.") - create_csv( - output_csv_file, info, data_folder, split, convert_opus_to_wav - ) + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + if use_webdataset: + create_shards( + output, + info, + data_folder, + split, + convert_opus_to_wav, + verbose=verbose, + samples_per_shard=samples_per_shard, + max_size_shard=max_size_shard, + ) + else: + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") @@ -214,6 +249,142 @@ def process_line( return utterances +def _write_in_sink( + items, max_size_shard, samples_per_shard, save_folder, verbose=0 +): + """ + Write the GigaSpeechRow in the webdataset sinks. + + Parameters + ---------- + items : list + The list of items to be written in the sink. + max_size_shard : int + The maximum size of the shard. + samples_per_shard: int + The number of samples per shard. + save_folder : str + The path to the folder where the shards will be saved. + verbose : int, optional + The verbosity level for the webdataset. + + Returns + ------- + list + The list of items written in the sink. 
+ """ + import webdataset as wds + + id_proc, row = items + pattern = os.path.join(save_folder, f"GigaSpeech-{id_proc}-%06d.tar") + with wds.ShardWriter( + pattern, + maxsize=max_size_shard, + maxcount=samples_per_shard, + verbose=verbose, + ) as sink: + for item in row: + start_sample = int(item.begin_time * SAMPLING_RATE) + stop_sample = int(item.end_time * SAMPLING_RATE) + audio = sb.dataio.dataio.read_audio( + { + "file": item.audio_path, + "start": start_sample, + "stop": stop_sample, + } + ) + + sample = { + "__key__": item.utt_id, + "audio.pth": audio, + "text": item.text, + } + + # write back to sink + sink.write(sample) + return row + + +def create_shards( + shards_folder_path: str, + info: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, + verbose: int = 0, + samples_per_shard=500, + max_size_shard=1e9, +) -> None: + """ + Create shards for the GigaSpeech dataset. + + Parameters + ---------- + shards_folder_path : str + The path to the shards folder. + info : dict + The GigaSpeech JSON file content. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. + convert_opus_to_wav : bool + If True, the opus files will be converted to wav files. + samples_per_shard: int + The number of samples per shard. + max_size_shard : int + The maximum size of the shard. + + Returns + ------- + None + """ + total_duration = 0.0 + nb_samples = 0 + + line_processor = functools.partial( + process_line, + data_folder=data_folder, + split=split, + convert_opus_to_wav=convert_opus_to_wav, + ) + + os.makedirs(shards_folder_path, exist_ok=True) + + audios_info = [(0, [])] + id_proc = 1 + for row in parallel_map(line_processor, info["audios"]): + if row is None: + continue + + # creates buckets of samples_per_shard size for each shard + for item in row: + audios_info[-1][1].append(item) + if len(audios_info[-1][1]) == samples_per_shard: + audios_info.append((id_proc, [])) + id_proc += 1 + + sink_processor = functools.partial( + _write_in_sink, + save_folder=shards_folder_path, + max_size_shard=max_size_shard, + samples_per_shard=samples_per_shard, + verbose=verbose, + ) + + logger.info(f"Starting writing shards in {shards_folder_path}...") + for row in parallel_map(sink_processor, audios_info, chunk_size=1): + for item in row: + total_duration += item.duration + nb_samples += 1 + + logger.info(f"{split} shards succesfully created at {shards_folder_path}!") + logger.info(f"Number of samples in {split} split: {nb_samples}") + logger.info( + f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" + ) + + def create_csv( csv_file: str, info: json, @@ -317,7 +488,7 @@ def convert_opus2wav(audio_opus_path): """ audio_wav_path = audio_opus_path.replace(".opus", ".wav") os.system( - f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar 16000 {audio_wav_path} > /dev/null 2>&1" + f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar {SAMPLING_RATE} {audio_wav_path} > /dev/null 2>&1" ) return audio_wav_path @@ -368,7 +539,23 @@ def preprocess_text(text: str) -> str: return text.lower() -def skip(save_csv_files: dict) -> bool: +def skip_webdataset(save_folders: str) -> bool: + """ Check if the webdataset shards already exist. + + Parameters + ---------- + save_folders : str + The path to the folder where the shards will be saved. + + Returns + ------- + bool + True if the webdataset shards already exist, False otherwise. 
+ """ + return all(os.listdir(folder) for folder in save_folders.values()) + + +def skip_csv(save_csv_files: dict) -> bool: """ Check if the CSV files already exist. Parameters From 44785c078a71774b0932aa498ff9c08a0583c4f6 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Tue, 13 Feb 2024 21:37:07 +0100 Subject: [PATCH 22/77] remove print --- recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 1 - recipes/GigaSpeech/gigaspeech_prepare.py | 1 - 2 files changed, 2 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 0d2671fb09..f8c52f3a9f 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -126,7 +126,6 @@ def prepare_gigaspeech( None """ logger.info(f"Preparing GigaSpeech dataset in {save_folder}...") - print(f"Input args: {locals()}") if skip_prep: logger.info("Skipping data preparation as `skip_prep` is set to `True`") diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 0d2671fb09..f8c52f3a9f 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -126,7 +126,6 @@ def prepare_gigaspeech( None """ logger.info(f"Preparing GigaSpeech dataset in {save_folder}...") - print(f"Input args: {locals()}") if skip_prep: logger.info("Skipping data preparation as `skip_prep` is set to `True`") From e203d77b4f824162b2606b3ab22e290120839139 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Tue, 13 Feb 2024 21:38:58 +0100 Subject: [PATCH 23/77] tmp files webdataset --- .../ASR/CTC/hparams/train_hf_wavlm_wbs.yaml | 189 +++++++++ .../ASR/CTC/train_with_wavlm_wbs.py | 359 ++++++++++++++++++ 2 files changed, 548 insertions(+) create mode 100644 recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml create mode 100644 recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml new file mode 100644 index 0000000000..ac1817ff0e --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml @@ -0,0 +1,189 @@ +# ################################ +# Model: wavlm + DNN + CTC +# Decoding AM: Greedy for validation, and Beam search for testing +# Augmentation: SpecAugment +# Authors: Adel Moumen 2024 +# ################################ + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/train_wavlm_char_wbs_v2/ +output_wer_folder: !ref / +save_folder: !ref /save +train_log: !ref /train_log.txt + +wav2vec2_hub: microsoft/wavlm-large +wav2vec2_folder: !ref /wav2vec2_checkpoint + +# Data files +data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech + +# see https://github.com/SpeechColab/GigaSpeech for more details on the dataset +# must be one of ["XS", "S", "M", "L", "XL"] +# and ["DEV", "TEST"] for the eval splits. 
+splits: ["XS", "DEV", "TEST"] +skip_prep: False +convert_opus_to_wav: True +ckpt_interval_minutes: 25 # save checkpoint every N min +# if use_webdataset is True, we expect the output_folder to contain the shards folder +use_webdataset: True +verbose: 0 +samples_per_shard: 500 +train_shards_folder_path: !ref /shards/train +valid_shards_folder_path: !ref /shards/dev +test_shards_folder_path: !ref /shards/test +json_file: !ref /GigaSpeech.json + +# Training parameters +number_of_epochs: 5 +lr: 0.9 +lr_wav2vec: 0.0001 +sorting: ascending +num_workers: 4 +precision: fp32 # bf16, fp16 or fp32 +sample_rate: 16000 + +# These parameters work for a single GPU of 32GB using fp16 +batch_size: 16 +test_batch_size: 1 + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + +# Model parameters +activation: !name:torch.nn.LeakyReLU +dnn_layers: 2 +dnn_neurons: 1024 +freeze_wav2vec: True + +# Outputs +output_neurons: 34 +blank_index: 0 + +# Decoding parameters +test_beam_search: + beam_size: 143 + topk: 1 + blank_index: !ref + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -1.2 + prune_history: True + +# +# Functions and classes +# +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref + speeds: [95, 100, 105] + +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1 + drop_length_high: 5 + drop_count_low: 1000 + drop_count_high: 2000 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + parallel_augment: False + concat_original: True + repeat_augment: 1 + shuffle_augmentations: False + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref , + !ref , + !ref ] + + +enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN + input_shape: [null, null, 1024] + activation: !ref + dnn_blocks: !ref + dnn_neurons: !ref + +wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: True + freeze: !ref + save_path: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + +modules: + wav2vec2: !ref + enc: !ref + ctc_lin: !ref + +model: !new:torch.nn.ModuleList + - [!ref , !ref ] + +model_opt_class: !name:torch.optim.Adadelta + lr: !ref + rho: 0.95 + eps: 1.e-8 + +wav2vec_opt_class: !name:torch.optim.Adam + lr: !ref + +lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.9 + patient: 0 + +label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + wav2vec2: !ref + model: !ref + 
scheduler_model: !ref + scheduler_wav2vec: !ref + counter: !ref + tokenizer: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py new file mode 100644 index 0000000000..3f1f97a733 --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py @@ -0,0 +1,359 @@ +"""TODO + +Authors + * Adel Moumen 2024 +""" +import os +import sys +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from hyperpyyaml import load_hyperpyyaml +import webdataset as wds + +logger = logging.getLogger(__name__) + + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + + print(wavs.shape) + exit() + + # Downsample the inputs if specified + if hasattr(self.modules, "downsampler"): + wavs = self.modules.downsampler(wavs) + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) + + # Forward pass + + # Handling SpeechBrain vs HuggingFance pretrained models + if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model + latents = self.modules.extractor(wavs) + feats = self.modules.encoder_wrapper(latents, wav_lens=wav_lens)[ + "embeddings" + ] + else: # HuggingFace pretrained model + feats = self.modules.wav2vec2(wavs, wav_lens) + + x = self.modules.enc(feats) + + # Compute outputs + logits = self.modules.ctc_lin(x) + + # Upsample the inputs if they have been highly downsampled + if hasattr(self.hparams, "upsampling") and self.hparams.upsampling: + logits = logits.view( + logits.shape[0], -1, self.hparams.output_neurons + ) + + p_ctc = self.hparams.log_softmax(logits) + + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + # elif stage == sb.Stage.TEST: + # p_tokens = test_searcher(p_ctc, wav_lens) + else: + p_tokens = None + + return p_ctc, wav_lens, p_tokens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + + ids = batch.id + tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + loss = loss_ctc + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = [ + "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") + for utt_seq in predicted_tokens + ] + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" ") for wrd in batch.text] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, 
stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.error_rate_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + old_lr_model, new_lr_model = self.hparams.lr_annealing_model( + stage_stats["loss"] + ) + old_lr_wav2vec, new_lr_wav2vec = self.hparams.lr_annealing_wav2vec( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.model_optimizer, new_lr_model + ) + sb.nnet.schedulers.update_learning_rate( + self.wav2vec_optimizer, new_lr_wav2vec + ) + self.hparams.train_logger.log_stats( + stats_meta={ + "epoch": epoch, + "lr_model": old_lr_model, + "lr_wav2vec": old_lr_wav2vec, + }, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"]}, min_keys=["WER"], + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.test_wer_file, "w") as w: + self.wer_metric.write_stats(w) + + def init_optimizers(self): + "Initializes the wav2vec2 optimizer and model optimizer" + # Handling SpeechBrain vs HuggingFace pretrained models + if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model + self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( + self.modules.encoder_wrapper.parameters() + ) + + else: # HuggingFace pretrained model + self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( + self.modules.wav2vec2.parameters() + ) + + self.model_optimizer = self.hparams.model_opt_class( + self.hparams.model.parameters() + ) + + # save the optimizers in a dictionary + # the key will be used in `freeze_optimizers()` + self.optimizers_dict = { + "model_optimizer": self.model_optimizer, + } + if not self.hparams.freeze_wav2vec: + self.optimizers_dict["wav2vec_optimizer"] = self.wav2vec_optimizer + + if self.checkpointer is not None: + self.checkpointer.add_recoverable( + "wav2vec_opt", self.wav2vec_optimizer + ) + self.checkpointer.add_recoverable("modelopt", self.model_optimizer) + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + + label_encoder = sb.dataio.encoder.CTCTextEncoder() + lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") + special_labels = { + "blank_label": hparams["blank_index"], + } + + # 1. 
Create datasets + if hparams["use_webdataset"]: + + def audio_pipeline(sample_dict): + key = sample_dict["__key__"] + audio_tensor = sample_dict["audio.pth"] + text = sample_dict["text"] + return { + "id": key, + "sig": audio_tensor, + "text": text, + } + + import glob + + train_data = ( + wds.WebDataset( + glob.glob(hparams["train_shards_folder_path"] + "/*.tar") + ) + .repeat() + .decode() + .map(audio_pipeline) + ) + + # sb.dataio.dataset.add_dynamic_item([train_data], audio_pipeline) + + label_encoder.load_or_create( + path=lab_enc_file, + # from_didatasets=[train_data], + # output_key="char_list", + special_labels=special_labels, + # sequence_input=True, + ) + + valid_data = ( + wds.WebDataset( + glob.glob(hparams["valid_shards_folder_path"] + "/*.tar") + ) + .repeat() + .decode() + .map(audio_pipeline) + ) + + test_data = ( + wds.WebDataset( + glob.glob(hparams["test_shards_folder_path"] + "/*.tar") + ) + .repeat() + .decode() + .map(audio_pipeline) + ) + + else: + print("Not implemented yet") + + return train_data, valid_data, test_data, label_encoder + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from gigaspeech_prepare import prepare_gigaspeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_gigaspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["save_folder"], + "splits": hparams["splits"], + "output_train": hparams["train_shards_folder_path"], + "output_dev": hparams["valid_shards_folder_path"], + "output_test": hparams["test_shards_folder_path"], + "json_file": hparams["json_file"], + "verbose": hparams["verbose"], + "skip_prep": hparams["skip_prep"], + "convert_opus_to_wav": hparams["convert_opus_to_wav"], + "use_webdataset": hparams["use_webdataset"], + "samples_per_shard": hparams["samples_per_shard"], + "max_size_shard": hparams.get("max_size_shard", 1e9), + }, + ) + + # here we create the datasets objects as well as tokenization and encoding + train_data, valid_data, test_data, label_encoder = dataio_prepare(hparams) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # # We load the pretrained wav2vec2 model + # if "pretrainer" in hparams.keys(): + # run_on_main(hparams["pretrainer"].collect_files) + # hparams["pretrainer"].load_collected() + + # # We dynamicaly add the tokenizer to our brain class. + # # NB: This tokenizer corresponds to the one used for the LM!! 
+ # asr_brain.tokenizer = label_encoder + + # ind2lab = label_encoder.ind2lab + # vocab_list = [ind2lab[x] for x in range(len(ind2lab))] + # print(vocab_list) + + # from speechbrain.decoders.ctc import CTCBeamSearcher + + # test_searcher = CTCBeamSearcher( + # **hparams["test_beam_search"], vocab_list=vocab_list, + # ) + + hparams["train_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch + hparams["valid_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + os.makedirs(hparams["output_wer_folder"], exist_ok=True) + + # report WER on valid data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], f"valid_wer.txt" + ) + asr_brain.evaluate( + valid_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) + + # report WER on test data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], f"test_wer.txt" + ) + asr_brain.evaluate( + test_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) From 9b44e8db17c335484e44c132b73eb42512b7f0b2 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Wed, 14 Feb 2024 11:33:16 +0100 Subject: [PATCH 24/77] verbosity + metada.json --- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 29 ++++++++++--------- recipes/GigaSpeech/gigaspeech_prepare.py | 29 ++++++++++--------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index f8c52f3a9f..2f9a995092 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -30,6 +30,7 @@ SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] SAMPLING_RATE = 16000 +VERBOSITY_WEBDATASET = 0 @dataclass @@ -77,7 +78,6 @@ def prepare_gigaspeech( skip_prep: bool = False, convert_opus_to_wav: bool = True, use_webdataset: bool = True, - verbose: int = 0, samples_per_shard=500, max_size_shard=1e9, ) -> None: @@ -114,8 +114,6 @@ def prepare_gigaspeech( If True, the opus files will be converted to wav files. use_webdataset : bool, optional If True, the data will be saved in the webdataset format. - verbose : int, optional - The verbosity level for the webdataset. samples_per_shard: int The number of samples per shard. max_size_shard : int @@ -150,15 +148,14 @@ def prepare_gigaspeech( for split in splits: if split in TRAIN_SUBSET: save_output[split] = output_train - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) else: if split == "DEV": save_output[split] = output_dev elif split == "TEST": save_output[split] = output_test - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) + + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) # check if the data is already prepared if use_webdataset and skip_webdataset(save_output): @@ -187,7 +184,6 @@ def prepare_gigaspeech( data_folder, split, convert_opus_to_wav, - verbose=verbose, samples_per_shard=samples_per_shard, max_size_shard=max_size_shard, ) @@ -249,7 +245,7 @@ def process_line( def _write_in_sink( - items, max_size_shard, samples_per_shard, save_folder, verbose=0 + items, max_size_shard, samples_per_shard, save_folder, ): """ Write the GigaSpeechRow in the webdataset sinks. 
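The hunks around this point change how _write_in_sink drives webdataset: the verbose flag moves from a per-call argument to the module-level VERBOSITY_WEBDATASET constant. For readers who have not used webdataset before, here is a minimal, self-contained sketch of the same write/read round trip the recipe relies on; the scratch folder name and the dummy tensors are illustrative assumptions, not part of the patch:

import glob
import os

import torch
import webdataset as wds

shard_dir = "demo_shards"  # hypothetical scratch folder
os.makedirs(shard_dir, exist_ok=True)

# Write a few fake utterances the way _write_in_sink does: one dict per
# sample, with tar member names derived from the dict keys ("audio.pth"
# is serialized with torch.save); maxcount/maxsize bound each shard.
pattern = os.path.join(shard_dir, "GigaSpeech-demo-%06d.tar")
with wds.ShardWriter(pattern, maxsize=1e9, maxcount=2, verbose=0) as sink:
    for i in range(5):
        sink.write(
            {
                "__key__": f"UTT{i:04d}",
                "audio.pth": torch.zeros(16000),  # 1 s of silence at 16 kHz
                "text": f"dummy transcript {i}",
            }
        )

# Read the shards back; .decode() restores the tensor from the "audio.pth"
# entry and the utf-8 transcript from the "text" entry.
dataset = wds.WebDataset(glob.glob(os.path.join(shard_dir, "*.tar"))).decode()
for sample in dataset:
    print(sample["__key__"], sample["audio.pth"].shape, sample["text"])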
@@ -264,8 +260,6 @@ def _write_in_sink( The number of samples per shard. save_folder : str The path to the folder where the shards will be saved. - verbose : int, optional - The verbosity level for the webdataset. Returns ------- @@ -280,7 +274,7 @@ def _write_in_sink( pattern, maxsize=max_size_shard, maxcount=samples_per_shard, - verbose=verbose, + verbose=VERBOSITY_WEBDATASET, ) as sink: for item in row: start_sample = int(item.begin_time * SAMPLING_RATE) @@ -310,7 +304,6 @@ def create_shards( data_folder: str, split: str, convert_opus_to_wav: bool, - verbose: int = 0, samples_per_shard=500, max_size_shard=1e9, ) -> None: @@ -368,7 +361,6 @@ def create_shards( save_folder=shards_folder_path, max_size_shard=max_size_shard, samples_per_shard=samples_per_shard, - verbose=verbose, ) logger.info(f"Starting writing shards in {shards_folder_path}...") @@ -377,6 +369,15 @@ def create_shards( total_duration += item.duration nb_samples += 1 + metadata_dict = { + "split": split, + "nb_samples": nb_samples, + } + metadata_file_path = os.path.join(shards_folder_path, "metadata.json") + # we need to save the size of the split so that TQDM can work properly + with open(metadata_file_path, "w") as f: + json.dump(metadata_dict, f) + logger.info(f"{split} shards succesfully created at {shards_folder_path}!") logger.info(f"Number of samples in {split} split: {nb_samples}") logger.info( diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index f8c52f3a9f..2f9a995092 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -30,6 +30,7 @@ SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] SAMPLING_RATE = 16000 +VERBOSITY_WEBDATASET = 0 @dataclass @@ -77,7 +78,6 @@ def prepare_gigaspeech( skip_prep: bool = False, convert_opus_to_wav: bool = True, use_webdataset: bool = True, - verbose: int = 0, samples_per_shard=500, max_size_shard=1e9, ) -> None: @@ -114,8 +114,6 @@ def prepare_gigaspeech( If True, the opus files will be converted to wav files. use_webdataset : bool, optional If True, the data will be saved in the webdataset format. - verbose : int, optional - The verbosity level for the webdataset. samples_per_shard: int The number of samples per shard. max_size_shard : int @@ -150,15 +148,14 @@ def prepare_gigaspeech( for split in splits: if split in TRAIN_SUBSET: save_output[split] = output_train - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) else: if split == "DEV": save_output[split] = output_dev elif split == "TEST": save_output[split] = output_test - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) + + if use_webdataset: + os.makedirs(save_output[split], exist_ok=True) # check if the data is already prepared if use_webdataset and skip_webdataset(save_output): @@ -187,7 +184,6 @@ def prepare_gigaspeech( data_folder, split, convert_opus_to_wav, - verbose=verbose, samples_per_shard=samples_per_shard, max_size_shard=max_size_shard, ) @@ -249,7 +245,7 @@ def process_line( def _write_in_sink( - items, max_size_shard, samples_per_shard, save_folder, verbose=0 + items, max_size_shard, samples_per_shard, save_folder, ): """ Write the GigaSpeechRow in the webdataset sinks. @@ -264,8 +260,6 @@ def _write_in_sink( The number of samples per shard. save_folder : str The path to the folder where the shards will be saved. - verbose : int, optional - The verbosity level for the webdataset. 
Returns ------- @@ -280,7 +274,7 @@ def _write_in_sink( pattern, maxsize=max_size_shard, maxcount=samples_per_shard, - verbose=verbose, + verbose=VERBOSITY_WEBDATASET, ) as sink: for item in row: start_sample = int(item.begin_time * SAMPLING_RATE) @@ -310,7 +304,6 @@ def create_shards( data_folder: str, split: str, convert_opus_to_wav: bool, - verbose: int = 0, samples_per_shard=500, max_size_shard=1e9, ) -> None: @@ -368,7 +361,6 @@ def create_shards( save_folder=shards_folder_path, max_size_shard=max_size_shard, samples_per_shard=samples_per_shard, - verbose=verbose, ) logger.info(f"Starting writing shards in {shards_folder_path}...") @@ -377,6 +369,15 @@ def create_shards( total_duration += item.duration nb_samples += 1 + metadata_dict = { + "split": split, + "nb_samples": nb_samples, + } + metadata_file_path = os.path.join(shards_folder_path, "metadata.json") + # we need to save the size of the split so that TQDM can work properly + with open(metadata_file_path, "w") as f: + json.dump(metadata_dict, f) + logger.info(f"{split} shards succesfully created at {shards_folder_path}!") logger.info(f"Number of samples in {split} split: {nb_samples}") logger.info( From 142615667eddbbe5e4ea711fdbb020ed5c7cbf3b Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Wed, 14 Feb 2024 11:34:54 +0100 Subject: [PATCH 25/77] letzo now label_encoder can actually train + the recipe seems to work. --- .../ASR/CTC/hparams/train_hf_wavlm_wbs.yaml | 4 +- .../ASR/CTC/train_with_wavlm_wbs.py | 139 +++++++++++++----- 2 files changed, 101 insertions(+), 42 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml index ac1817ff0e..e60f560df6 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml @@ -28,7 +28,6 @@ convert_opus_to_wav: True ckpt_interval_minutes: 25 # save checkpoint every N min # if use_webdataset is True, we expect the output_folder to contain the shards folder use_webdataset: True -verbose: 0 samples_per_shard: 500 train_shards_folder_path: !ref /shards/train valid_shards_folder_path: !ref /shards/dev @@ -66,8 +65,9 @@ dnn_neurons: 1024 freeze_wav2vec: True # Outputs -output_neurons: 34 +output_neurons: 35 blank_index: 0 +unk_label: "" # Decoding parameters test_beam_search: diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py index 3f1f97a733..cb2cee0883 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py @@ -10,6 +10,7 @@ from speechbrain.utils.distributed import run_on_main, if_main_process from hyperpyyaml import load_hyperpyyaml import webdataset as wds +import torch logger = logging.getLogger(__name__) @@ -21,9 +22,6 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) wavs, wav_lens = batch.sig - print(wavs.shape) - exit() - # Downsample the inputs if specified if hasattr(self.modules, "downsampler"): wavs = self.modules.downsampler(wavs) @@ -60,8 +58,8 @@ def compute_forward(self, batch, stage): p_tokens = sb.decoders.ctc_greedy_decode( p_ctc, wav_lens, blank_id=self.hparams.blank_index ) - # elif stage == sb.Stage.TEST: - # p_tokens = test_searcher(p_ctc, wav_lens) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) else: p_tokens = None @@ -192,40 +190,71 @@ def dataio_prepare(hparams): lab_enc_file = os.path.join(hparams["save_folder"], 
"label_encoder.txt") special_labels = { "blank_label": hparams["blank_index"], + "unk_label": hparams["unk_label"], } # 1. Create datasets if hparams["use_webdataset"]: + import json + + # load the meta info json file + file_path = os.path.join( + hparams["train_shards_folder_path"], "metadata.json" + ) + with wds.gopen(file_path, "rb") as f: + train_meta = json.load(f) + + file_path = os.path.join( + hparams["valid_shards_folder_path"], "metadata.json" + ) + with wds.gopen(file_path, "rb") as f: + val_meta = json.load(f) + + file_path = os.path.join( + hparams["test_shards_folder_path"], "metadata.json" + ) + with wds.gopen(file_path, "rb") as f: + test_meta = json.load(f) + + def _audio_pipeline(sample_dict): + return sample_dict["text"] + + import glob + + train_files = glob.glob(hparams["train_shards_folder_path"] + "/*.tar") + + def _text_generator(shard_files): + for shard_file in shard_files: + for sample_dict in ( + wds.WebDataset(shard_file).decode().map(_audio_pipeline) + ): + yield sample_dict + + label_encoder.load_or_create( + path=lab_enc_file, + from_iterables=[_text_generator(train_files)], + sequence_input=True, + special_labels=special_labels, + ) def audio_pipeline(sample_dict): key = sample_dict["__key__"] audio_tensor = sample_dict["audio.pth"] text = sample_dict["text"] + char_list = list(text) + tokens_list = label_encoder.encode_sequence(char_list) + tokens = torch.LongTensor(tokens_list) return { "id": key, "sig": audio_tensor, "text": text, + "char_list": char_list, + "tokens_list": tokens_list, + "tokens": tokens, } - import glob - train_data = ( - wds.WebDataset( - glob.glob(hparams["train_shards_folder_path"] + "/*.tar") - ) - .repeat() - .decode() - .map(audio_pipeline) - ) - - # sb.dataio.dataset.add_dynamic_item([train_data], audio_pipeline) - - label_encoder.load_or_create( - path=lab_enc_file, - # from_didatasets=[train_data], - # output_key="char_list", - special_labels=special_labels, - # sequence_input=True, + wds.WebDataset(train_files).repeat().decode().map(audio_pipeline) ) valid_data = ( @@ -249,7 +278,15 @@ def audio_pipeline(sample_dict): else: print("Not implemented yet") - return train_data, valid_data, test_data, label_encoder + return ( + train_data, + valid_data, + test_data, + label_encoder, + train_meta, + val_meta, + test_meta, + ) if __name__ == "__main__": @@ -284,7 +321,6 @@ def audio_pipeline(sample_dict): "output_dev": hparams["valid_shards_folder_path"], "output_test": hparams["test_shards_folder_path"], "json_file": hparams["json_file"], - "verbose": hparams["verbose"], "skip_prep": hparams["skip_prep"], "convert_opus_to_wav": hparams["convert_opus_to_wav"], "use_webdataset": hparams["use_webdataset"], @@ -294,7 +330,15 @@ def audio_pipeline(sample_dict): ) # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_data, label_encoder = dataio_prepare(hparams) + ( + train_data, + valid_data, + test_data, + label_encoder, + train_meta, + val_meta, + test_meta, + ) = dataio_prepare(hparams) # Trainer initialization asr_brain = ASR( @@ -304,27 +348,42 @@ def audio_pipeline(sample_dict): checkpointer=hparams["checkpointer"], ) - # # We load the pretrained wav2vec2 model - # if "pretrainer" in hparams.keys(): - # run_on_main(hparams["pretrainer"].collect_files) - # hparams["pretrainer"].load_collected() + # We load the pretrained wav2vec2 model + if "pretrainer" in hparams.keys(): + run_on_main(hparams["pretrainer"].collect_files) + hparams["pretrainer"].load_collected() - # # We dynamicaly 
add the tokenizer to our brain class. - # # NB: This tokenizer corresponds to the one used for the LM!! - # asr_brain.tokenizer = label_encoder + # We dynamicaly add the tokenizer to our brain class. + # NB: This tokenizer corresponds to the one used for the LM!! + asr_brain.tokenizer = label_encoder - # ind2lab = label_encoder.ind2lab - # vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - # print(vocab_list) + ind2lab = label_encoder.ind2lab + vocab_list = [ind2lab[x] for x in range(len(ind2lab))] + print(vocab_list) + print(len(vocab_list)) - # from speechbrain.decoders.ctc import CTCBeamSearcher + from speechbrain.decoders.ctc import CTCBeamSearcher - # test_searcher = CTCBeamSearcher( - # **hparams["test_beam_search"], vocab_list=vocab_list, - # ) + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], vocab_list=vocab_list, + ) hparams["train_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch hparams["valid_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch + hparams["test_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch + + hparams["train_dataloader_opts"]["looped_nominal_epoch"] = ( + train_meta["nb_samples"] + // hparams["train_dataloader_opts"]["batch_size"] + ) + + hparams["valid_dataloader_opts"]["looped_nominal_epoch"] = ( + val_meta["nb_samples"] // hparams["valid_dataloader_opts"]["batch_size"] + ) + + hparams["test_dataloader_opts"]["looped_nominal_epoch"] = ( + test_meta["nb_samples"] // hparams["test_dataloader_opts"]["batch_size"] + ) # Training asr_brain.fit( From ce12662e18b1463e859979ab63391f82ab3c249f Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:08:39 +0100 Subject: [PATCH 26/77] remove wbs --- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 189 +----------------- recipes/GigaSpeech/gigaspeech_prepare.py | 189 +----------------- 2 files changed, 4 insertions(+), 374 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 2f9a995092..0d5e40cfa9 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -16,7 +16,6 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map -import speechbrain as sb logger = logging.getLogger(__name__) @@ -30,7 +29,6 @@ SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] SAMPLING_RATE = 16000 -VERBOSITY_WEBDATASET = 0 @dataclass @@ -77,9 +75,6 @@ def prepare_gigaspeech( json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, - use_webdataset: bool = True, - samples_per_shard=500, - max_size_shard=1e9, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -112,12 +107,6 @@ def prepare_gigaspeech( If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. - use_webdataset : bool, optional - If True, the data will be saved in the webdataset format. - samples_per_shard: int - The number of samples per shard. - max_size_shard : int - The maximum size of the shard. 
Returns ------- @@ -154,14 +143,8 @@ def prepare_gigaspeech( elif split == "TEST": save_output[split] = output_test - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) - # check if the data is already prepared - if use_webdataset and skip_webdataset(save_output): - logger.info("Skipping preparation, completed in previous run.") - return - elif skip_csv(save_output): + if skip_csv(save_output): logger.info("Skipping preparation, completed in previous run.") return else: @@ -177,18 +160,7 @@ def prepare_gigaspeech( for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") - if use_webdataset: - create_shards( - output, - info, - data_folder, - split, - convert_opus_to_wav, - samples_per_shard=samples_per_shard, - max_size_shard=max_size_shard, - ) - else: - create_csv(output, info, data_folder, split, convert_opus_to_wav) + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") @@ -244,147 +216,6 @@ def process_line( return utterances -def _write_in_sink( - items, max_size_shard, samples_per_shard, save_folder, -): - """ - Write the GigaSpeechRow in the webdataset sinks. - - Parameters - ---------- - items : list - The list of items to be written in the sink. - max_size_shard : int - The maximum size of the shard. - samples_per_shard: int - The number of samples per shard. - save_folder : str - The path to the folder where the shards will be saved. - - Returns - ------- - list - The list of items written in the sink. - """ - import webdataset as wds - - id_proc, row = items - pattern = os.path.join(save_folder, f"GigaSpeech-{id_proc}-%06d.tar") - with wds.ShardWriter( - pattern, - maxsize=max_size_shard, - maxcount=samples_per_shard, - verbose=VERBOSITY_WEBDATASET, - ) as sink: - for item in row: - start_sample = int(item.begin_time * SAMPLING_RATE) - stop_sample = int(item.end_time * SAMPLING_RATE) - audio = sb.dataio.dataio.read_audio( - { - "file": item.audio_path, - "start": start_sample, - "stop": stop_sample, - } - ) - - sample = { - "__key__": item.utt_id, - "audio.pth": audio, - "text": item.text, - } - - # write back to sink - sink.write(sample) - return row - - -def create_shards( - shards_folder_path: str, - info: json, - data_folder: str, - split: str, - convert_opus_to_wav: bool, - samples_per_shard=500, - max_size_shard=1e9, -) -> None: - """ - Create shards for the GigaSpeech dataset. - - Parameters - ---------- - shards_folder_path : str - The path to the shards folder. - info : dict - The GigaSpeech JSON file content. - data_folder : str - The path to the GigaSpeech dataset. - split : str - The split to be used for filtering the data. - convert_opus_to_wav : bool - If True, the opus files will be converted to wav files. - samples_per_shard: int - The number of samples per shard. - max_size_shard : int - The maximum size of the shard. 
- - Returns - ------- - None - """ - total_duration = 0.0 - nb_samples = 0 - - line_processor = functools.partial( - process_line, - data_folder=data_folder, - split=split, - convert_opus_to_wav=convert_opus_to_wav, - ) - - os.makedirs(shards_folder_path, exist_ok=True) - - audios_info = [(0, [])] - id_proc = 1 - for row in parallel_map(line_processor, info["audios"]): - if row is None: - continue - - # creates buckets of samples_per_shard size for each shard - for item in row: - audios_info[-1][1].append(item) - if len(audios_info[-1][1]) == samples_per_shard: - audios_info.append((id_proc, [])) - id_proc += 1 - - sink_processor = functools.partial( - _write_in_sink, - save_folder=shards_folder_path, - max_size_shard=max_size_shard, - samples_per_shard=samples_per_shard, - ) - - logger.info(f"Starting writing shards in {shards_folder_path}...") - for row in parallel_map(sink_processor, audios_info, chunk_size=1): - for item in row: - total_duration += item.duration - nb_samples += 1 - - metadata_dict = { - "split": split, - "nb_samples": nb_samples, - } - metadata_file_path = os.path.join(shards_folder_path, "metadata.json") - # we need to save the size of the split so that TQDM can work properly - with open(metadata_file_path, "w") as f: - json.dump(metadata_dict, f) - - logger.info(f"{split} shards succesfully created at {shards_folder_path}!") - logger.info(f"Number of samples in {split} split: {nb_samples}") - logger.info( - f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" - ) - - def create_csv( csv_file: str, info: json, @@ -539,22 +370,6 @@ def preprocess_text(text: str) -> str: return text.lower() -def skip_webdataset(save_folders: str) -> bool: - """ Check if the webdataset shards already exist. - - Parameters - ---------- - save_folders : str - The path to the folder where the shards will be saved. - - Returns - ------- - bool - True if the webdataset shards already exist, False otherwise. - """ - return all(os.listdir(folder) for folder in save_folders.values()) - - def skip_csv(save_csv_files: dict) -> bool: """ Check if the CSV files already exist. diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 2f9a995092..0d5e40cfa9 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -16,7 +16,6 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map -import speechbrain as sb logger = logging.getLogger(__name__) @@ -30,7 +29,6 @@ SPLITS = ["DEV", "TEST"] TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] SAMPLING_RATE = 16000 -VERBOSITY_WEBDATASET = 0 @dataclass @@ -77,9 +75,6 @@ def prepare_gigaspeech( json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, - use_webdataset: bool = True, - samples_per_shard=500, - max_size_shard=1e9, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -112,12 +107,6 @@ def prepare_gigaspeech( If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. - use_webdataset : bool, optional - If True, the data will be saved in the webdataset format. - samples_per_shard: int - The number of samples per shard. - max_size_shard : int - The maximum size of the shard. 
Returns ------- @@ -154,14 +143,8 @@ def prepare_gigaspeech( elif split == "TEST": save_output[split] = output_test - if use_webdataset: - os.makedirs(save_output[split], exist_ok=True) - # check if the data is already prepared - if use_webdataset and skip_webdataset(save_output): - logger.info("Skipping preparation, completed in previous run.") - return - elif skip_csv(save_output): + if skip_csv(save_output): logger.info("Skipping preparation, completed in previous run.") return else: @@ -177,18 +160,7 @@ def prepare_gigaspeech( for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") - if use_webdataset: - create_shards( - output, - info, - data_folder, - split, - convert_opus_to_wav, - samples_per_shard=samples_per_shard, - max_size_shard=max_size_shard, - ) - else: - create_csv(output, info, data_folder, split, convert_opus_to_wav) + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") @@ -244,147 +216,6 @@ def process_line( return utterances -def _write_in_sink( - items, max_size_shard, samples_per_shard, save_folder, -): - """ - Write the GigaSpeechRow in the webdataset sinks. - - Parameters - ---------- - items : list - The list of items to be written in the sink. - max_size_shard : int - The maximum size of the shard. - samples_per_shard: int - The number of samples per shard. - save_folder : str - The path to the folder where the shards will be saved. - - Returns - ------- - list - The list of items written in the sink. - """ - import webdataset as wds - - id_proc, row = items - pattern = os.path.join(save_folder, f"GigaSpeech-{id_proc}-%06d.tar") - with wds.ShardWriter( - pattern, - maxsize=max_size_shard, - maxcount=samples_per_shard, - verbose=VERBOSITY_WEBDATASET, - ) as sink: - for item in row: - start_sample = int(item.begin_time * SAMPLING_RATE) - stop_sample = int(item.end_time * SAMPLING_RATE) - audio = sb.dataio.dataio.read_audio( - { - "file": item.audio_path, - "start": start_sample, - "stop": stop_sample, - } - ) - - sample = { - "__key__": item.utt_id, - "audio.pth": audio, - "text": item.text, - } - - # write back to sink - sink.write(sample) - return row - - -def create_shards( - shards_folder_path: str, - info: json, - data_folder: str, - split: str, - convert_opus_to_wav: bool, - samples_per_shard=500, - max_size_shard=1e9, -) -> None: - """ - Create shards for the GigaSpeech dataset. - - Parameters - ---------- - shards_folder_path : str - The path to the shards folder. - info : dict - The GigaSpeech JSON file content. - data_folder : str - The path to the GigaSpeech dataset. - split : str - The split to be used for filtering the data. - convert_opus_to_wav : bool - If True, the opus files will be converted to wav files. - samples_per_shard: int - The number of samples per shard. - max_size_shard : int - The maximum size of the shard. 
- - Returns - ------- - None - """ - total_duration = 0.0 - nb_samples = 0 - - line_processor = functools.partial( - process_line, - data_folder=data_folder, - split=split, - convert_opus_to_wav=convert_opus_to_wav, - ) - - os.makedirs(shards_folder_path, exist_ok=True) - - audios_info = [(0, [])] - id_proc = 1 - for row in parallel_map(line_processor, info["audios"]): - if row is None: - continue - - # creates buckets of samples_per_shard size for each shard - for item in row: - audios_info[-1][1].append(item) - if len(audios_info[-1][1]) == samples_per_shard: - audios_info.append((id_proc, [])) - id_proc += 1 - - sink_processor = functools.partial( - _write_in_sink, - save_folder=shards_folder_path, - max_size_shard=max_size_shard, - samples_per_shard=samples_per_shard, - ) - - logger.info(f"Starting writing shards in {shards_folder_path}...") - for row in parallel_map(sink_processor, audios_info, chunk_size=1): - for item in row: - total_duration += item.duration - nb_samples += 1 - - metadata_dict = { - "split": split, - "nb_samples": nb_samples, - } - metadata_file_path = os.path.join(shards_folder_path, "metadata.json") - # we need to save the size of the split so that TQDM can work properly - with open(metadata_file_path, "w") as f: - json.dump(metadata_dict, f) - - logger.info(f"{split} shards succesfully created at {shards_folder_path}!") - logger.info(f"Number of samples in {split} split: {nb_samples}") - logger.info( - f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" - ) - - def create_csv( csv_file: str, info: json, @@ -539,22 +370,6 @@ def preprocess_text(text: str) -> str: return text.lower() -def skip_webdataset(save_folders: str) -> bool: - """ Check if the webdataset shards already exist. - - Parameters - ---------- - save_folders : str - The path to the folder where the shards will be saved. - - Returns - ------- - bool - True if the webdataset shards already exist, False otherwise. - """ - return all(os.listdir(folder) for folder in save_folders.values()) - - def skip_csv(save_csv_files: dict) -> bool: """ Check if the CSV files already exist. From ed3ba035776ab736602055c8b6560ff64ca02891 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:09:12 +0100 Subject: [PATCH 27/77] DL info --- recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py | 2 ++ recipes/GigaSpeech/download_gigaspeech.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py b/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py index c1e3f919d7..357540bbe5 100644 --- a/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py +++ b/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py @@ -1,5 +1,7 @@ """ Note for reviewer: this is a temporary script. It may be removed in the future. +Note2: for EU/US users, using this script might be VERY slow. It is instead +recommended to use the HuggingFace script. Download script for GigaSpeech dataset. diff --git a/recipes/GigaSpeech/download_gigaspeech.py b/recipes/GigaSpeech/download_gigaspeech.py index c1e3f919d7..357540bbe5 100644 --- a/recipes/GigaSpeech/download_gigaspeech.py +++ b/recipes/GigaSpeech/download_gigaspeech.py @@ -1,5 +1,7 @@ """ Note for reviewer: this is a temporary script. It may be removed in the future. +Note2: for EU/US users, using this script might be VERY slow. It is instead +recommended to use the HuggingFace script. Download script for GigaSpeech dataset. 
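The note added above recommends the Hugging Face route without spelling it out; the next patch wires it in through a local dataset.py loading script. As a minimal sketch, downloading a subset through the hub looks roughly like this. It assumes you have accepted the GigaSpeech terms on the speechcolab/gigaspeech hub repository (the same repository the loading script below pulls its archives from) and authenticated with huggingface-cli login; the subset name and cache path are examples only:

from datasets import load_dataset

# "xs" is the smallest training subset; larger configs are s, m, l, xl,
# and dev / test select the evaluation sets.
gigaspeech = load_dataset(
    "speechcolab/gigaspeech",
    "xs",
    cache_dir="path/to/GigaSpeech",  # example path
    trust_remote_code=True,
)
print(gigaspeech)

# Each row exposes the fields declared in the loading script's features,
# e.g. the segment id and the normalized transcript.
sample = gigaspeech["train"][0]
print(sample["segment_id"], sample["text"])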
From 8ae360bdab42b44ae0612a4b067e8170926f36a5 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:18:52 +0100 Subject: [PATCH 28/77] HF DL support --- recipes/GigaSpeech/ASR/CTC/dataset.py | 441 ++++++++++++++++++ .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 167 ++++++- recipes/GigaSpeech/dataset.py | 441 ++++++++++++++++++ recipes/GigaSpeech/gigaspeech_prepare.py | 167 ++++++- 4 files changed, 1196 insertions(+), 20 deletions(-) create mode 100644 recipes/GigaSpeech/ASR/CTC/dataset.py create mode 100644 recipes/GigaSpeech/dataset.py diff --git a/recipes/GigaSpeech/ASR/CTC/dataset.py b/recipes/GigaSpeech/ASR/CTC/dataset.py new file mode 100644 index 0000000000..9dee453b28 --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/dataset.py @@ -0,0 +1,441 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MODIFIED BY: Adel Moumen 2024 +""" +GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality +labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised +and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts +and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, +sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable +for speech recognition training, and to filter out segments with low-quality transcription. For system training, +GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. +For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, +and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, +are re-processed by professional human transcribers to ensure high transcription quality. 
+""" + +import csv +import os + +import datasets + +_CITATION = """\ +@article{DBLP:journals/corr/abs-2106-06909, + author = {Guoguo Chen and + Shuzhou Chai and + Guanbo Wang and + Jiayu Du and + Wei{-}Qiang Zhang and + Chao Weng and + Dan Su and + Daniel Povey and + Jan Trmal and + Junbo Zhang and + Mingjie Jin and + Sanjeev Khudanpur and + Shinji Watanabe and + Shuaijiang Zhao and + Wei Zou and + Xiangang Li and + Xuchen Yao and + Yongqing Wang and + Yujun Wang and + Zhao You and + Zhiyong Yan}, + title = {GigaSpeech: An Evolving, Multi-domain {ASR} Corpus with 10, 000 Hours + of Transcribed Audio}, + journal = {CoRR}, + volume = {abs/2106.06909}, + year = {2021}, + url = {https://arxiv.org/abs/2106.06909}, + eprinttype = {arXiv}, + eprint = {2106.06909}, + timestamp = {Wed, 29 Dec 2021 14:29:26 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2106-06909.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +""" + +_DESCRIPTION = """\ +GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality +labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised +and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts +and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, +sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable +for speech recognition training, and to filter out segments with low-quality transcription. For system training, +GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. +For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, +and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, +are re-processed by professional human transcribers to ensure high transcription quality. 
+""" + +_HOMEPAGE = "https://github.com/SpeechColab/GigaSpeech" + +_LICENSE = "Apache License 2.0" + +_CATEGORIES = ( + "People and Blogs", + "Business", + "Nonprofits and Activism", + "Crime", + "History", + "Pets and Animals", + "News and Politics", + "Travel and Events", + "Kids and Family", + "Leisure", + "N/A", + "Comedy", + "News and Politics", + "Sports", + "Arts", + "Science and Technology", + "Autos and Vehicles", + "Science and Technology", + "People and Blogs", + "Music", + "Society and Culture", + "Education", + "Howto and Style", + "Film and Animation", + "Gaming", + "Entertainment", + "Travel and Events", + "Health and Fitness", + "audiobook", +) + +_SOURCES = ("audiobook", "podcast", "youtube") + +_SUBSETS = ("xs", "s", "m", "l", "xl") + +_BASE_DATA_URL = ( + "https://huggingface.co/datasets/speechcolab/gigaspeech/resolve/main/data/" +) + +_AUDIO_ARCHIVE_URL = ( + _BASE_DATA_URL + + "audio/{subset}_files{is_additional}/{subset}_chunks_{archive_id:04}.tar.gz" +) + +_META_URL = ( + _BASE_DATA_URL + + "metadata/{subset}_metadata{is_additional}/{subset}_chunks_{archive_id:04}_metadata.csv" +) + +_N_ARCHIVES_URL = _BASE_DATA_URL + "{subset}_n_archives{is_additional}.txt" + +logger = datasets.utils.logging.get_logger(__name__) + + +class GigaspeechConfig(datasets.BuilderConfig): + """BuilderConfig for Gigaspeech.""" + + def __init__(self, name, *args, **kwargs): + """BuilderConfig for Gigaspeech + """ + super().__init__(name=name, *args, **kwargs) + # larger subsets are supersets of smaller subsets, + # if we want to download "m", we need to download "xs" and "s" data too. + # so if name == "m", self.subsets_to_download will be ("xs", "s", "m") + if name not in {"dev", "test"}: + self.subsets_to_download = _SUBSETS[: _SUBSETS.index(name) + 1] + else: + self.subsets_to_download = (name,) + + +class Gigaspeech(datasets.GeneratorBasedBuilder): + """ + GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality + labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised + and unsupervised training (this implementation contains only labelled data for now). + Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts + and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, + sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable + for speech recognition training, and to filter out segments with low-quality transcription. For system training, + GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. + For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, + and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, + are re-processed by professional human transcribers to ensure high transcription quality. 
+ """ + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + GigaspeechConfig(name=subset) for subset in _SUBSETS + ("dev", "test") + ] + + DEFAULT_WRITER_BATCH_SIZE = 128 + + def _info(self): + features = datasets.Features( + { + "segment_id": datasets.Value("string"), + "speaker": datasets.Value("string"), + "text": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000, decode=False), + "begin_time": datasets.Value("float32"), + "end_time": datasets.Value("float32"), + "audio_id": datasets.Value("string"), + "title": datasets.Value("string"), + "url": datasets.Value("string"), + "source": datasets.ClassLabel(names=_SOURCES), + "category": datasets.ClassLabel(names=_CATEGORIES), + "original_full_path": datasets.Value( + "string" + ), # relative path to full audio in original data dirs + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _is_additional_data(self, name): + if name in {"s", "m", "l", "xl"}: + return "_additional" + return "" + + @property + def _splits_to_subsets(self): + return { + "train": self.config.subsets_to_download, + "dev": ["dev"], + "test": ["test"], + } + + def _read_n_archives(self, n_archives_path): + with open(n_archives_path, encoding="utf-8") as f: + return int(f.read().strip()) + + def _split_generators(self, dl_manager): + splits_to_subsets = self._splits_to_subsets + if self.config.name in {"dev", "test"}: + splits = (self.config.name,) + else: + splits = ("train", "dev", "test") + + # 1. get number of archives (shards) in each subset + n_archives_links = { + split: { + subset: _N_ARCHIVES_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + ) + for subset in splits_to_subsets[split] + } + for split in splits + } + n_archives_paths = dl_manager.download_and_extract(n_archives_links) + n_archives = { + # mapping from a subset to a single number - number of audio archives (shards) in a subset + split: { + subset: self._read_n_archives(n_archives_paths[split][subset]) + for subset in splits_to_subsets[split] + } + for split in splits + } + + # 2. prepare sharded archives with audio files + audio_archives_urls = { + split: { + subset: [ + _AUDIO_ARCHIVE_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + archive_id=i, + ) + for i in range(n_archives[split][subset]) + ] + for subset in splits_to_subsets[split] + } + for split in splits + } + audio_archives_paths = dl_manager.download(audio_archives_urls) + # flatten archives paths from + # {"train": {"xs": [path1, path2,], "s": [path3], "m": [path5, path5]}, "dev": {"dev": [path6,...]}, "test": {"test": [...]}} + # to {"train": [path1, path2, path3, path4, path5], "dev": [path6, ...], "test": [...]} + audio_archives_paths = _flatten_nested_dict(audio_archives_paths) + local_audio_archives_paths = ( + dl_manager.extract(audio_archives_paths) + if not dl_manager.is_streaming + else None + ) + + # 3. 
prepare sharded metadata csv files + meta_urls = { + split: { + subset: [ + _META_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + archive_id=i, + ) + for i in range(n_archives[split][subset]) + ] + for subset in splits_to_subsets[split] + } + for split in splits + } + meta_paths = dl_manager.download_and_extract(meta_urls) + meta_paths = _flatten_nested_dict(meta_paths) + + if self.config.name not in {"dev", "test"}: + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["train"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "train" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["train"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["dev"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "dev" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["dev"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["test"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "test" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["test"], + }, + ), + ] + + if self.config.name == "dev": + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["dev"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "dev" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["dev"], + }, + ), + ] + + if self.config.name == "test": + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["test"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "test" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["test"], + }, + ), + ] + + def _generate_examples( + self, audio_archives_iterators, local_audio_archives_paths, meta_paths + ): + assert len(audio_archives_iterators) == len(meta_paths) + if local_audio_archives_paths: + assert len(audio_archives_iterators) == len( + local_audio_archives_paths + ) + + for i, (meta_path, audio_archive_iterator) in enumerate( + zip(meta_paths, audio_archives_iterators) + ): + meta_dict = dict() + with open(meta_path) as csvfile: + meta_csv = csv.DictReader(csvfile) + for line in meta_csv: + meta_dict[line["sid"]] = line + + for audio_path_in_archive, audio_file in audio_archive_iterator: + # `audio_path_in_archive` is like "dev_chunks_0000/YOU1000000029_S0000095.wav" + audio_filename = os.path.split(audio_path_in_archive)[1] + audio_id = audio_filename.split(".wav")[0] + audio_meta = meta_dict[audio_id] + audio_meta["segment_id"] = audio_meta.pop("sid") + audio_meta["original_full_path"] = audio_meta.pop("path") + audio_meta["text"] = audio_meta.pop("text_tn") + audio_meta["audio_id"] = audio_meta.pop("aid") + if not audio_meta["category"]: + audio_meta["category"] = "N/A" + + path = ( + os.path.join( + local_audio_archives_paths[i], 
audio_path_in_archive + ) + if local_audio_archives_paths + else audio_path_in_archive + ) + + yield audio_id, { + "audio": {"path": path, "bytes": audio_file.read()}, + **{ + feature: value + for feature, value in audio_meta.items() + if feature in self.info.features + }, + } + + +def _flatten_nested_dict(nested_dict): + return { + key: [ + inner_list_element + for inner_list in value_to_lists.values() + for inner_list_element in inner_list + ] + for key, value_to_lists in nested_dict.items() + } diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 0d5e40cfa9..1b45ef07ee 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map +import speechbrain as sb logger = logging.getLogger(__name__) @@ -75,6 +76,7 @@ def prepare_gigaspeech( json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, + download_with_HF: bool = False, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -107,6 +109,12 @@ def prepare_gigaspeech( If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. + download_with_HF : bool, optional + If True, the dataset will be downloaded using the Hugging Face datasets library. + We highly recommend using this option if you are based in the EU or US as it will + be faster and more reliable than the official host. Make sure to read the + instructions on how to get the dataset from Hugging Face here: + https://huggingface.co/datasets/speechcolab/gigaspeech Returns ------- @@ -134,9 +142,11 @@ def prepare_gigaspeech( # Setting output paths save_output = {} + train_split = "" for split in splits: if split in TRAIN_SUBSET: save_output[split] = output_train + train_split = split else: if split == "DEV": save_output[split] = output_dev @@ -150,22 +160,38 @@ def prepare_gigaspeech( else: logger.info("Starting data preparation...") - # check that the data folder contains the GigaSpeech dataset - check_gigaspeech_folders(data_folder, json_file) + if download_with_HF: + from datasets import load_dataset - logger.info(f"Starting reading {json_file}.") - with open(json_file, "r") as f: - info = json.load(f) - logger.info(f"Reading {json_file} done.") + hf_dataset = load_dataset( + "dataset.py", + train_split.lower(), + trust_remote_code=True, + cache_dir=data_folder, + data_dir=data_folder, + ) + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + HF_create_csv( + output, hf_dataset[split], split, + ) + else: + # check that the data folder contains the GigaSpeech dataset + check_gigaspeech_folders(data_folder, json_file) - for split, output in save_output.items(): - logger.info(f"Starting creating {output} using {split} split.") - create_csv(output, info, data_folder, split, convert_opus_to_wav) + logger.info(f"Starting reading {json_file}.") + with open(json_file, "r") as f: + info = json.load(f) + logger.info(f"Reading {json_file} done.") + + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") def process_line( - audio: json, data_folder: str, split: str, 
convert_opus_to_wav: bool + audio: json, data_folder: str, split: str, convert_opus_to_wav: bool, ) -> list: """ Process the audio line and return the utterances for the given split. @@ -299,6 +325,127 @@ def create_csv( ) +def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: + """ + Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. + + Parameters + ---------- + csv_file : str + The path to the CSV file to be created. + info : dict + The GigaSpeech JSON file content. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. + + Returns + ------- + None + """ + total_duration = 0.0 + nb_samples = 0 + + line_processor = functools.partial(HF_process_line,) + + csv_file_tmp = csv_file + ".tmp" + with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL + ) + header = [ + "ID", + "audio_id", + "audio_path", + "speaker", + "begin_time", + "end_time", + "duration", + "text", + ] + csv_writer.writerow(header) + + for row in parallel_map(line_processor, hf_dataset, chunk_size=1024): + if row is None: + continue + + csv_writer.writerow( + [ + row.utt_id, + row.audio_id, + row.audio_path, + row.speaker, + str(row.begin_time), + str(row.end_time), + str(row.duration), + row.text, + ] + ) + + total_duration += row.duration + nb_samples += 1 + + os.replace(csv_file_tmp, csv_file) + + print(f"{csv_file} succesfully created!") + print(f"Number of samples in {split} split: {nb_samples}") + print( + f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" + ) + + +def HF_process_line(row,) -> list: + """ + Process the audio line and return the utterances for the given split. + + Parameters + ---------- + row: dict + The audio line to be processed. + + Returns + ------- + list + The list of utterances for the given split. + """ + audio_path = os.path.join(row["audio"]["path"]) + + assert os.path.isfile(audio_path), f"File not found: {audio_path}" + + # check reading the audio file ; HF may have some corrupted files + try: + _ = sb.dataio.dataio.read_audio(audio_path) + except Exception as e: + logger.error(f"Failed reading {audio_path}: {e}") + return None + + text = preprocess_text(row["text"]) + if text: + utt_id = row["segment_id"] + audio_id = row["audio_id"] + audio_path = row["audio"]["path"] + speaker = row["speaker"] + begin_time = float(row["begin_time"]) + end_time = float(row["end_time"]) + duration = end_time - begin_time + + row = GigaSpeechRow( + utt_id=utt_id, + audio_id=audio_id, + audio_path=audio_path, + speaker=speaker, + begin_time=begin_time, + end_time=end_time, + duration=duration, + text=text, + ) + + return row + else: + return None + + def convert_opus2wav(audio_opus_path): """Convert an opus file to a wav file. diff --git a/recipes/GigaSpeech/dataset.py b/recipes/GigaSpeech/dataset.py new file mode 100644 index 0000000000..9dee453b28 --- /dev/null +++ b/recipes/GigaSpeech/dataset.py @@ -0,0 +1,441 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MODIFIED BY: Adel Moumen 2024 +""" +GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality +labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised +and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts +and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, +sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable +for speech recognition training, and to filter out segments with low-quality transcription. For system training, +GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. +For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, +and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, +are re-processed by professional human transcribers to ensure high transcription quality. +""" + +import csv +import os + +import datasets + +_CITATION = """\ +@article{DBLP:journals/corr/abs-2106-06909, + author = {Guoguo Chen and + Shuzhou Chai and + Guanbo Wang and + Jiayu Du and + Wei{-}Qiang Zhang and + Chao Weng and + Dan Su and + Daniel Povey and + Jan Trmal and + Junbo Zhang and + Mingjie Jin and + Sanjeev Khudanpur and + Shinji Watanabe and + Shuaijiang Zhao and + Wei Zou and + Xiangang Li and + Xuchen Yao and + Yongqing Wang and + Yujun Wang and + Zhao You and + Zhiyong Yan}, + title = {GigaSpeech: An Evolving, Multi-domain {ASR} Corpus with 10, 000 Hours + of Transcribed Audio}, + journal = {CoRR}, + volume = {abs/2106.06909}, + year = {2021}, + url = {https://arxiv.org/abs/2106.06909}, + eprinttype = {arXiv}, + eprint = {2106.06909}, + timestamp = {Wed, 29 Dec 2021 14:29:26 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2106-06909.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +""" + +_DESCRIPTION = """\ +GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality +labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised +and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts +and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, +sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable +for speech recognition training, and to filter out segments with low-quality transcription. For system training, +GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. +For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, +and for all our other smaller training subsets, we cap it at 0%. 
The DEV and TEST evaluation sets, on the other hand, +are re-processed by professional human transcribers to ensure high transcription quality. +""" + +_HOMEPAGE = "https://github.com/SpeechColab/GigaSpeech" + +_LICENSE = "Apache License 2.0" + +_CATEGORIES = ( + "People and Blogs", + "Business", + "Nonprofits and Activism", + "Crime", + "History", + "Pets and Animals", + "News and Politics", + "Travel and Events", + "Kids and Family", + "Leisure", + "N/A", + "Comedy", + "News and Politics", + "Sports", + "Arts", + "Science and Technology", + "Autos and Vehicles", + "Science and Technology", + "People and Blogs", + "Music", + "Society and Culture", + "Education", + "Howto and Style", + "Film and Animation", + "Gaming", + "Entertainment", + "Travel and Events", + "Health and Fitness", + "audiobook", +) + +_SOURCES = ("audiobook", "podcast", "youtube") + +_SUBSETS = ("xs", "s", "m", "l", "xl") + +_BASE_DATA_URL = ( + "https://huggingface.co/datasets/speechcolab/gigaspeech/resolve/main/data/" +) + +_AUDIO_ARCHIVE_URL = ( + _BASE_DATA_URL + + "audio/{subset}_files{is_additional}/{subset}_chunks_{archive_id:04}.tar.gz" +) + +_META_URL = ( + _BASE_DATA_URL + + "metadata/{subset}_metadata{is_additional}/{subset}_chunks_{archive_id:04}_metadata.csv" +) + +_N_ARCHIVES_URL = _BASE_DATA_URL + "{subset}_n_archives{is_additional}.txt" + +logger = datasets.utils.logging.get_logger(__name__) + + +class GigaspeechConfig(datasets.BuilderConfig): + """BuilderConfig for Gigaspeech.""" + + def __init__(self, name, *args, **kwargs): + """BuilderConfig for Gigaspeech + """ + super().__init__(name=name, *args, **kwargs) + # larger subsets are supersets of smaller subsets, + # if we want to download "m", we need to download "xs" and "s" data too. + # so if name == "m", self.subsets_to_download will be ("xs", "s", "m") + if name not in {"dev", "test"}: + self.subsets_to_download = _SUBSETS[: _SUBSETS.index(name) + 1] + else: + self.subsets_to_download = (name,) + + +class Gigaspeech(datasets.GeneratorBasedBuilder): + """ + GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality + labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised + and unsupervised training (this implementation contains only labelled data for now). + Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts + and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, + sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable + for speech recognition training, and to filter out segments with low-quality transcription. For system training, + GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. + For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, + and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, + are re-processed by professional human transcribers to ensure high transcription quality. 
+ """ + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + GigaspeechConfig(name=subset) for subset in _SUBSETS + ("dev", "test") + ] + + DEFAULT_WRITER_BATCH_SIZE = 128 + + def _info(self): + features = datasets.Features( + { + "segment_id": datasets.Value("string"), + "speaker": datasets.Value("string"), + "text": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000, decode=False), + "begin_time": datasets.Value("float32"), + "end_time": datasets.Value("float32"), + "audio_id": datasets.Value("string"), + "title": datasets.Value("string"), + "url": datasets.Value("string"), + "source": datasets.ClassLabel(names=_SOURCES), + "category": datasets.ClassLabel(names=_CATEGORIES), + "original_full_path": datasets.Value( + "string" + ), # relative path to full audio in original data dirs + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _is_additional_data(self, name): + if name in {"s", "m", "l", "xl"}: + return "_additional" + return "" + + @property + def _splits_to_subsets(self): + return { + "train": self.config.subsets_to_download, + "dev": ["dev"], + "test": ["test"], + } + + def _read_n_archives(self, n_archives_path): + with open(n_archives_path, encoding="utf-8") as f: + return int(f.read().strip()) + + def _split_generators(self, dl_manager): + splits_to_subsets = self._splits_to_subsets + if self.config.name in {"dev", "test"}: + splits = (self.config.name,) + else: + splits = ("train", "dev", "test") + + # 1. get number of archives (shards) in each subset + n_archives_links = { + split: { + subset: _N_ARCHIVES_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + ) + for subset in splits_to_subsets[split] + } + for split in splits + } + n_archives_paths = dl_manager.download_and_extract(n_archives_links) + n_archives = { + # mapping from a subset to a single number - number of audio archives (shards) in a subset + split: { + subset: self._read_n_archives(n_archives_paths[split][subset]) + for subset in splits_to_subsets[split] + } + for split in splits + } + + # 2. prepare sharded archives with audio files + audio_archives_urls = { + split: { + subset: [ + _AUDIO_ARCHIVE_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + archive_id=i, + ) + for i in range(n_archives[split][subset]) + ] + for subset in splits_to_subsets[split] + } + for split in splits + } + audio_archives_paths = dl_manager.download(audio_archives_urls) + # flatten archives paths from + # {"train": {"xs": [path1, path2,], "s": [path3], "m": [path5, path5]}, "dev": {"dev": [path6,...]}, "test": {"test": [...]}} + # to {"train": [path1, path2, path3, path4, path5], "dev": [path6, ...], "test": [...]} + audio_archives_paths = _flatten_nested_dict(audio_archives_paths) + local_audio_archives_paths = ( + dl_manager.extract(audio_archives_paths) + if not dl_manager.is_streaming + else None + ) + + # 3. 
prepare sharded metadata csv files + meta_urls = { + split: { + subset: [ + _META_URL.format( + subset=subset, + is_additional=self._is_additional_data(subset), + archive_id=i, + ) + for i in range(n_archives[split][subset]) + ] + for subset in splits_to_subsets[split] + } + for split in splits + } + meta_paths = dl_manager.download_and_extract(meta_urls) + meta_paths = _flatten_nested_dict(meta_paths) + + if self.config.name not in {"dev", "test"}: + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["train"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "train" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["train"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["dev"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "dev" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["dev"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["test"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "test" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["test"], + }, + ), + ] + + if self.config.name == "dev": + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["dev"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "dev" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["dev"], + }, + ), + ] + + if self.config.name == "test": + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "audio_archives_iterators": [ + dl_manager.iter_archive(archive_path) + for archive_path in audio_archives_paths["test"] + ], + "local_audio_archives_paths": local_audio_archives_paths[ + "test" + ] + if local_audio_archives_paths + else None, + "meta_paths": meta_paths["test"], + }, + ), + ] + + def _generate_examples( + self, audio_archives_iterators, local_audio_archives_paths, meta_paths + ): + assert len(audio_archives_iterators) == len(meta_paths) + if local_audio_archives_paths: + assert len(audio_archives_iterators) == len( + local_audio_archives_paths + ) + + for i, (meta_path, audio_archive_iterator) in enumerate( + zip(meta_paths, audio_archives_iterators) + ): + meta_dict = dict() + with open(meta_path) as csvfile: + meta_csv = csv.DictReader(csvfile) + for line in meta_csv: + meta_dict[line["sid"]] = line + + for audio_path_in_archive, audio_file in audio_archive_iterator: + # `audio_path_in_archive` is like "dev_chunks_0000/YOU1000000029_S0000095.wav" + audio_filename = os.path.split(audio_path_in_archive)[1] + audio_id = audio_filename.split(".wav")[0] + audio_meta = meta_dict[audio_id] + audio_meta["segment_id"] = audio_meta.pop("sid") + audio_meta["original_full_path"] = audio_meta.pop("path") + audio_meta["text"] = audio_meta.pop("text_tn") + audio_meta["audio_id"] = audio_meta.pop("aid") + if not audio_meta["category"]: + audio_meta["category"] = "N/A" + + path = ( + os.path.join( + local_audio_archives_paths[i], 
audio_path_in_archive + ) + if local_audio_archives_paths + else audio_path_in_archive + ) + + yield audio_id, { + "audio": {"path": path, "bytes": audio_file.read()}, + **{ + feature: value + for feature, value in audio_meta.items() + if feature in self.info.features + }, + } + + +def _flatten_nested_dict(nested_dict): + return { + key: [ + inner_list_element + for inner_list in value_to_lists.values() + for inner_list_element in inner_list + ] + for key, value_to_lists in nested_dict.items() + } diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 0d5e40cfa9..1b45ef07ee 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools from speechbrain.utils.parallel import parallel_map +import speechbrain as sb logger = logging.getLogger(__name__) @@ -75,6 +76,7 @@ def prepare_gigaspeech( json_file: str = "GigaSpeech.json", skip_prep: bool = False, convert_opus_to_wav: bool = True, + download_with_HF: bool = False, ) -> None: """ Prepare the csv files for GigaSpeech dataset. @@ -107,6 +109,12 @@ def prepare_gigaspeech( If True, the data preparation will be skipped, and the function will return immediately. convert_opus_to_wav : bool, optional If True, the opus files will be converted to wav files. + download_with_HF : bool, optional + If True, the dataset will be downloaded using the Hugging Face datasets library. + We highly recommend using this option if you are based in the EU or US as it will + be faster and more reliable than the official host. Make sure to read the + instructions on how to get the dataset from Hugging Face here: + https://huggingface.co/datasets/speechcolab/gigaspeech Returns ------- @@ -134,9 +142,11 @@ def prepare_gigaspeech( # Setting output paths save_output = {} + train_split = "" for split in splits: if split in TRAIN_SUBSET: save_output[split] = output_train + train_split = split else: if split == "DEV": save_output[split] = output_dev @@ -150,22 +160,38 @@ def prepare_gigaspeech( else: logger.info("Starting data preparation...") - # check that the data folder contains the GigaSpeech dataset - check_gigaspeech_folders(data_folder, json_file) + if download_with_HF: + from datasets import load_dataset - logger.info(f"Starting reading {json_file}.") - with open(json_file, "r") as f: - info = json.load(f) - logger.info(f"Reading {json_file} done.") + hf_dataset = load_dataset( + "dataset.py", + train_split.lower(), + trust_remote_code=True, + cache_dir=data_folder, + data_dir=data_folder, + ) + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + HF_create_csv( + output, hf_dataset[split], split, + ) + else: + # check that the data folder contains the GigaSpeech dataset + check_gigaspeech_folders(data_folder, json_file) - for split, output in save_output.items(): - logger.info(f"Starting creating {output} using {split} split.") - create_csv(output, info, data_folder, split, convert_opus_to_wav) + logger.info(f"Starting reading {json_file}.") + with open(json_file, "r") as f: + info = json.load(f) + logger.info(f"Reading {json_file} done.") + + for split, output in save_output.items(): + logger.info(f"Starting creating {output} using {split} split.") + create_csv(output, info, data_folder, split, convert_opus_to_wav) logger.info("Data preparation completed!") def process_line( - audio: json, data_folder: str, split: str, convert_opus_to_wav: bool + audio: 
json, data_folder: str, split: str, convert_opus_to_wav: bool, ) -> list: """ Process the audio line and return the utterances for the given split. @@ -299,6 +325,127 @@ def create_csv( ) +def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: + """ + Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. + + Parameters + ---------- + csv_file : str + The path to the CSV file to be created. + info : dict + The GigaSpeech JSON file content. + data_folder : str + The path to the GigaSpeech dataset. + split : str + The split to be used for filtering the data. + + Returns + ------- + None + """ + total_duration = 0.0 + nb_samples = 0 + + line_processor = functools.partial(HF_process_line,) + + csv_file_tmp = csv_file + ".tmp" + with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL + ) + header = [ + "ID", + "audio_id", + "audio_path", + "speaker", + "begin_time", + "end_time", + "duration", + "text", + ] + csv_writer.writerow(header) + + for row in parallel_map(line_processor, hf_dataset, chunk_size=1024): + if row is None: + continue + + csv_writer.writerow( + [ + row.utt_id, + row.audio_id, + row.audio_path, + row.speaker, + str(row.begin_time), + str(row.end_time), + str(row.duration), + row.text, + ] + ) + + total_duration += row.duration + nb_samples += 1 + + os.replace(csv_file_tmp, csv_file) + + print(f"{csv_file} succesfully created!") + print(f"Number of samples in {split} split: {nb_samples}") + print( + f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" + ) + + +def HF_process_line(row,) -> list: + """ + Process the audio line and return the utterances for the given split. + + Parameters + ---------- + row: dict + The audio line to be processed. + + Returns + ------- + list + The list of utterances for the given split. + """ + audio_path = os.path.join(row["audio"]["path"]) + + assert os.path.isfile(audio_path), f"File not found: {audio_path}" + + # check reading the audio file ; HF may have some corrupted files + try: + _ = sb.dataio.dataio.read_audio(audio_path) + except Exception as e: + logger.error(f"Failed reading {audio_path}: {e}") + return None + + text = preprocess_text(row["text"]) + if text: + utt_id = row["segment_id"] + audio_id = row["audio_id"] + audio_path = row["audio"]["path"] + speaker = row["speaker"] + begin_time = float(row["begin_time"]) + end_time = float(row["end_time"]) + duration = end_time - begin_time + + row = GigaSpeechRow( + utt_id=utt_id, + audio_id=audio_id, + audio_path=audio_path, + speaker=speaker, + begin_time=begin_time, + end_time=end_time, + duration=duration, + text=text, + ) + + return row + else: + return None + + def convert_opus2wav(audio_opus_path): """Convert an opus file to a wav file. 
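For context before the next patch in the series: the Hugging Face download path added above is driven from a recipe roughly as follows. This is a minimal sketch, not the recipe itself; it assumes the keyword names the series settles on later (`output_train` et al.), and every path is a placeholder.

```python
# Hypothetical driver for the HF download path; all paths are placeholders.
from gigaspeech_prepare import prepare_gigaspeech

prepare_gigaspeech(
    data_folder="/path/to/GigaSpeech",   # HF cache and extracted shards land here
    save_folder="results/prepare",       # bookkeeping folder for the recipe
    splits=["XS", "DEV", "TEST"],        # exactly one train subset plus eval splits
    output_train="results/prepare/train.csv",
    output_dev="results/prepare/dev.csv",
    output_test="results/prepare/test.csv",
    json_file="GigaSpeech.json",         # only read on the non-HF code path
    skip_prep=False,
    convert_opus_to_wav=True,            # only relevant for the official host
    download_with_HF=True,               # fetch via the local dataset.py loader
)
```

The `download_with_HF` branch passes `train_split.lower()` as the builder config of the local `dataset.py` loader, so `splits` must name exactly one of the `TRAIN_SUBSET` sizes.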
From 1601ddc1de45cecfc0abfe12c0911d789b7da145 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:19:13 +0100 Subject: [PATCH 29/77] remove webdataset as it sucks :p --- .../ASR/CTC/hparams/train_hf_wavlm_wbs.yaml | 189 -------- .../ASR/CTC/train_with_wavlm_wbs.py | 418 ------------------ 2 files changed, 607 deletions(-) delete mode 100644 recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml delete mode 100644 recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml deleted file mode 100644 index e60f560df6..0000000000 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm_wbs.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# ################################ -# Model: wavlm + DNN + CTC -# Decoding AM: Greedy for validation, and Beam search for testing -# Augmentation: SpecAugment -# Authors: Adel Moumen 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/train_wavlm_char_wbs_v2/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -wav2vec2_hub: microsoft/wavlm-large -wav2vec2_folder: !ref /wav2vec2_checkpoint - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech - -# see https://github.com/SpeechColab/GigaSpeech for more details on the dataset -# must be one of ["XS", "S", "M", "L", "XL"] -# and ["DEV", "TEST"] for the eval splits. -splits: ["XS", "DEV", "TEST"] -skip_prep: False -convert_opus_to_wav: True -ckpt_interval_minutes: 25 # save checkpoint every N min -# if use_webdataset is True, we expect the output_folder to contain the shards folder -use_webdataset: True -samples_per_shard: 500 -train_shards_folder_path: !ref /shards/train -valid_shards_folder_path: !ref /shards/dev -test_shards_folder_path: !ref /shards/test -json_file: !ref /GigaSpeech.json - -# Training parameters -number_of_epochs: 5 -lr: 0.9 -lr_wav2vec: 0.0001 -sorting: ascending -num_workers: 4 -precision: fp32 # bf16, fp16 or fp32 -sample_rate: 16000 - -# These parameters work for a single GPU of 32GB using fp16 -batch_size: 16 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - num_workers: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.LeakyReLU -dnn_layers: 2 -dnn_neurons: 1024 -freeze_wav2vec: True - -# Outputs -output_neurons: 35 -blank_index: 0 -unk_label: "" - -# Decoding parameters -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - -# -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# Speed perturbation -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref - speeds: [95, 100, 105] - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: 0 - drop_freq_high: 1 - drop_freq_count_low: 1 - drop_freq_count_high: 3 - drop_freq_width: 0.05 - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: 1 - drop_length_high: 5 - drop_count_low: 1000 - drop_count_high: 2000 - -# Augmenter: Combines previously defined augmentations to perform data 
augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref , - !ref , - !ref ] - - -enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN - input_shape: [null, null, 1024] - activation: !ref - dnn_blocks: !ref - dnn_neurons: !ref - -wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: True - freeze: !ref - save_path: !ref - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: !ref - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - wav2vec2: !ref - enc: !ref - ctc_lin: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adadelta - lr: !ref - rho: 0.95 - eps: 1.e-8 - -wav2vec_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - wav2vec2: !ref - model: !ref - scheduler_model: !ref - scheduler_wav2vec: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py deleted file mode 100644 index cb2cee0883..0000000000 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm_wbs.py +++ /dev/null @@ -1,418 +0,0 @@ -"""TODO - -Authors - * Adel Moumen 2024 -""" -import os -import sys -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -import webdataset as wds -import torch - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Downsample the inputs if specified - if hasattr(self.modules, "downsampler"): - wavs = self.modules.downsampler(wavs) - - # Add waveform augmentation if specified. 
- if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) - - # Forward pass - - # Handling SpeechBrain vs HuggingFance pretrained models - if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model - latents = self.modules.extractor(wavs) - feats = self.modules.encoder_wrapper(latents, wav_lens=wav_lens)[ - "embeddings" - ] - else: # HuggingFace pretrained model - feats = self.modules.wav2vec2(wavs, wav_lens) - - x = self.modules.enc(feats) - - # Compute outputs - logits = self.modules.ctc_lin(x) - - # Upsample the inputs if they have been highly downsampled - if hasattr(self.hparams, "upsampling") and self.hparams.upsampling: - logits = logits.view( - logits.shape[0], -1, self.hparams.output_neurons - ) - - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - else: - p_tokens = None - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - - ids = batch.id - tokens, tokens_lens = batch.tokens - - # Label Augmentation - if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - - loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - loss = loss_ctc - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.text] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_wav2vec, new_lr_wav2vec = self.hparams.lr_annealing_wav2vec( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.wav2vec_optimizer, new_lr_wav2vec - ) - self.hparams.train_logger.log_stats( - stats_meta={ - "epoch": epoch, - "lr_model": old_lr_model, - "lr_wav2vec": old_lr_wav2vec, - }, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the wav2vec2 optimizer and model optimizer" - # Handling SpeechBrain vs HuggingFace pretrained models - if hasattr(self.modules, "extractor"): # SpeechBrain pretrained model - self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( - self.modules.encoder_wrapper.parameters() - ) - - else: # HuggingFace pretrained model - self.wav2vec_optimizer = self.hparams.wav2vec_opt_class( - self.modules.wav2vec2.parameters() - ) - - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - - # save the optimizers in a dictionary - # the key will be used in `freeze_optimizers()` - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - if not self.hparams.freeze_wav2vec: - self.optimizers_dict["wav2vec_optimizer"] = self.wav2vec_optimizer - - if self.checkpointer is not None: - self.checkpointer.add_recoverable( - "wav2vec_opt", self.wav2vec_optimizer - ) - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - - label_encoder = sb.dataio.encoder.CTCTextEncoder() - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_label"], - } - - # 1. 
Create datasets - if hparams["use_webdataset"]: - import json - - # load the meta info json file - file_path = os.path.join( - hparams["train_shards_folder_path"], "metadata.json" - ) - with wds.gopen(file_path, "rb") as f: - train_meta = json.load(f) - - file_path = os.path.join( - hparams["valid_shards_folder_path"], "metadata.json" - ) - with wds.gopen(file_path, "rb") as f: - val_meta = json.load(f) - - file_path = os.path.join( - hparams["test_shards_folder_path"], "metadata.json" - ) - with wds.gopen(file_path, "rb") as f: - test_meta = json.load(f) - - def _audio_pipeline(sample_dict): - return sample_dict["text"] - - import glob - - train_files = glob.glob(hparams["train_shards_folder_path"] + "/*.tar") - - def _text_generator(shard_files): - for shard_file in shard_files: - for sample_dict in ( - wds.WebDataset(shard_file).decode().map(_audio_pipeline) - ): - yield sample_dict - - label_encoder.load_or_create( - path=lab_enc_file, - from_iterables=[_text_generator(train_files)], - sequence_input=True, - special_labels=special_labels, - ) - - def audio_pipeline(sample_dict): - key = sample_dict["__key__"] - audio_tensor = sample_dict["audio.pth"] - text = sample_dict["text"] - char_list = list(text) - tokens_list = label_encoder.encode_sequence(char_list) - tokens = torch.LongTensor(tokens_list) - return { - "id": key, - "sig": audio_tensor, - "text": text, - "char_list": char_list, - "tokens_list": tokens_list, - "tokens": tokens, - } - - train_data = ( - wds.WebDataset(train_files).repeat().decode().map(audio_pipeline) - ) - - valid_data = ( - wds.WebDataset( - glob.glob(hparams["valid_shards_folder_path"] + "/*.tar") - ) - .repeat() - .decode() - .map(audio_pipeline) - ) - - test_data = ( - wds.WebDataset( - glob.glob(hparams["test_shards_folder_path"] + "/*.tar") - ) - .repeat() - .decode() - .map(audio_pipeline) - ) - - else: - print("Not implemented yet") - - return ( - train_data, - valid_data, - test_data, - label_encoder, - train_meta, - val_meta, - test_meta, - ) - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from gigaspeech_prepare import prepare_gigaspeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_gigaspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["save_folder"], - "splits": hparams["splits"], - "output_train": hparams["train_shards_folder_path"], - "output_dev": hparams["valid_shards_folder_path"], - "output_test": hparams["test_shards_folder_path"], - "json_file": hparams["json_file"], - "skip_prep": hparams["skip_prep"], - "convert_opus_to_wav": hparams["convert_opus_to_wav"], - "use_webdataset": hparams["use_webdataset"], - "samples_per_shard": hparams["samples_per_shard"], - "max_size_shard": hparams.get("max_size_shard", 1e9), - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - ( - train_data, - valid_data, - test_data, - label_encoder, - train_meta, - val_meta, - test_meta, - ) = dataio_prepare(hparams) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - 
hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We load the pretrained wav2vec2 model - if "pretrainer" in hparams.keys(): - run_on_main(hparams["pretrainer"].collect_files) - hparams["pretrainer"].load_collected() - - # We dynamicaly add the tokenizer to our brain class. - # NB: This tokenizer corresponds to the one used for the LM!! - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - print(vocab_list) - print(len(vocab_list)) - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - hparams["train_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch - hparams["valid_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch - hparams["test_dataloader_opts"]["collate_fn"] = sb.dataio.batch.PaddedBatch - - hparams["train_dataloader_opts"]["looped_nominal_epoch"] = ( - train_meta["nb_samples"] - // hparams["train_dataloader_opts"]["batch_size"] - ) - - hparams["valid_dataloader_opts"]["looped_nominal_epoch"] = ( - val_meta["nb_samples"] // hparams["valid_dataloader_opts"]["batch_size"] - ) - - hparams["test_dataloader_opts"]["looped_nominal_epoch"] = ( - test_meta["nb_samples"] // hparams["test_dataloader_opts"]["batch_size"] - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - os.makedirs(hparams["output_wer_folder"], exist_ok=True) - - # report WER on valid data - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"valid_wer.txt" - ) - asr_brain.evaluate( - valid_data, - min_key="WER", - test_loader_kwargs=hparams["test_dataloader_opts"], - ) - - # report WER on test data - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"test_wer.txt" - ) - asr_brain.evaluate( - test_data, - min_key="WER", - test_loader_kwargs=hparams["test_dataloader_opts"], - ) From 9531d0b6455d29df36fd5867df6ea9c0bfd48c4d Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:19:26 +0100 Subject: [PATCH 30/77] name --- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 40b9236d7d..c4e54331c7 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -308,9 +308,9 @@ def text_pipeline(text): "data_folder": hparams["data_folder"], "save_folder": hparams["save_folder"], "splits": hparams["splits"], - "output_train_csv_filename": hparams["train_csv"], - "output_dev_csv_filename": hparams["valid_csv"], - "output_test_csv_filename": hparams["test_csv"], + "output_train": hparams["train_csv"], + "output_dev": hparams["valid_csv"], + "output_test": hparams["test_csv"], "json_file": hparams["json_file"], "skip_prep": hparams["skip_prep"], "convert_opus_to_wav": hparams["convert_opus_to_wav"], From 1356ff175375bece1ffdea110808bb282d291883 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:19:38 +0100 Subject: [PATCH 31/77] ngram commands --- recipes/GigaSpeech/ASR/CTC/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md 
b/recipes/GigaSpeech/ASR/CTC/README.md index 9cfe1d8671..b9d49667f5 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -1 +1,8 @@ -to do \ No newline at end of file +to do + +```bash +mkdir lm +git clone https://huggingface.co/wgb14/gigaspeech_lm lm +gunzip -c lm/3gram_pruned_1e7.arpa.gz > lm/3gram_pruned_1e7.arpa +gunzip -c lm/4gram.arpa.gz > lm/4gram.arpa +``` \ No newline at end of file From 0485173e75c6b95a40ba6f33416b939f69828779 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 14:33:06 +0100 Subject: [PATCH 32/77] whisper baseline --- recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml | 1 + .../GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py | 1 + 2 files changed, 2 insertions(+) create mode 100644 recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml create mode 100644 recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py diff --git a/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml b/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml new file mode 100644 index 0000000000..464090415c --- /dev/null +++ b/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml @@ -0,0 +1 @@ +# TODO diff --git a/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py b/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py new file mode 100644 index 0000000000..044a4824d9 --- /dev/null +++ b/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py @@ -0,0 +1 @@ +# todo From b360f8bcbb637ef2eab5aba2b0a375a9ceae2e0a Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Mon, 18 Mar 2024 20:38:37 +0100 Subject: [PATCH 33/77] fix HF --- recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 6 +++--- .../GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 4 +++- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 1b45ef07ee..77b6b6cdb4 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -145,13 +145,13 @@ def prepare_gigaspeech( train_split = "" for split in splits: if split in TRAIN_SUBSET: - save_output[split] = output_train + save_output["train"] = output_train train_split = split else: if split == "DEV": - save_output[split] = output_dev + save_output["validation"] = output_dev elif split == "TEST": - save_output[split] = output_test + save_output["test"] = output_test # check if the data is already prepared if skip_csv(save_output): diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index c1ff091b5f..daf3443d9a 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -8,7 +8,8 @@ # Seed needs to be set at top of yaml, before objects with parameters are made seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/train_wavlm_char/ +experiment_name: train_wavlm_char +output_folder: !ref results// output_wer_folder: !ref / save_folder: !ref /save train_log: !ref /train_log.txt @@ -24,6 +25,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # and ["DEV", "TEST"] for the eval splits. 
splits: ["XS", "DEV", "TEST"] skip_prep: False +download_with_HF: True convert_opus_to_wav: True ckpt_interval_minutes: 25 # save checkpoint every N min train_csv: !ref /train.csv diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index c4e54331c7..cde698345f 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -234,11 +234,14 @@ def dataio_prepare(hparams): @sb.utils.data_pipeline.takes("audio_path", "begin_time", "end_time") @sb.utils.data_pipeline.provides("sig") def audio_pipeline(audio_path, begin_time, end_time): - start_sample = int(float(begin_time) * hparams["sample_rate"]) - stop_sample = int(float(end_time) * hparams["sample_rate"]) - sig = sb.dataio.dataio.read_audio( - {"file": audio_path, "start": start_sample, "stop": stop_sample} - ) + if hparams["download_with_HF"]: + sig = sb.dataio.dataio.read_audio(audio_path) + else: + start_sample = int(float(begin_time) * hparams["sample_rate"]) + stop_sample = int(float(end_time) * hparams["sample_rate"]) + sig = sb.dataio.dataio.read_audio( + {"file": audio_path, "start": start_sample, "stop": stop_sample} + ) return sig sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -314,6 +317,7 @@ def text_pipeline(text): "json_file": hparams["json_file"], "skip_prep": hparams["skip_prep"], "convert_opus_to_wav": hparams["convert_opus_to_wav"], + "download_with_HF": hparams["download_with_HF"], }, ) From 81884ee23fceadf057c4f6890675e4d858277514 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Fri, 29 Mar 2024 16:17:57 +0100 Subject: [PATCH 34/77] pre-commit + sentencepiece char --- recipes/GigaSpeech/ASR/CTC/dataset.py | 53 ++++++----- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 37 +++++--- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 88 +++++++++++-------- recipes/GigaSpeech/dataset.py | 53 ++++++----- recipes/GigaSpeech/gigaspeech_prepare.py | 43 +++++---- 5 files changed, 154 insertions(+), 120 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/dataset.py b/recipes/GigaSpeech/ASR/CTC/dataset.py index 9dee453b28..3b6219efc8 100644 --- a/recipes/GigaSpeech/ASR/CTC/dataset.py +++ b/recipes/GigaSpeech/ASR/CTC/dataset.py @@ -144,8 +144,7 @@ class GigaspeechConfig(datasets.BuilderConfig): """BuilderConfig for Gigaspeech.""" def __init__(self, name, *args, **kwargs): - """BuilderConfig for Gigaspeech - """ + """BuilderConfig for Gigaspeech""" super().__init__(name=name, *args, **kwargs) # larger subsets are supersets of smaller subsets, # if we want to download "m", we need to download "xs" and "s" data too. 
@@ -304,11 +303,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["train"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "train" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["train"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["train"], }, ), @@ -319,11 +318,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["dev"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "dev" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["dev"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["dev"], }, ), @@ -334,11 +333,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["test"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "test" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["test"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["test"], }, ), @@ -353,11 +352,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["dev"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "dev" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["dev"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["dev"], }, ), @@ -372,11 +371,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["test"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "test" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["test"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["test"], }, ), diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py index 77b6b6cdb4..a1cd2f1db7 100644 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -34,7 +34,7 @@ @dataclass class GigaSpeechRow: - """ Dataclass for handling GigaSpeech rows. + """Dataclass for handling GigaSpeech rows. Attributes ---------- @@ -78,7 +78,7 @@ def prepare_gigaspeech( convert_opus_to_wav: bool = True, download_with_HF: bool = False, ) -> None: - """ Prepare the csv files for GigaSpeech dataset. + """Prepare the csv files for GigaSpeech dataset. Download instructions: https://github.com/SpeechColab/GigaSpeech Reference: https://arxiv.org/abs/2106.06909 @@ -173,7 +173,9 @@ def prepare_gigaspeech( for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") HF_create_csv( - output, hf_dataset[split], split, + output, + hf_dataset[split], + split, ) else: # check that the data folder contains the GigaSpeech dataset @@ -191,7 +193,10 @@ def prepare_gigaspeech( def process_line( - audio: json, data_folder: str, split: str, convert_opus_to_wav: bool, + audio: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, ) -> list: """ Process the audio line and return the utterances for the given split. 
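As a concrete reading of the `GigaSpeechRow` dataclass touched above: each CSV line written by the preparation script is one such row, with `duration` derived from the segment boundaries. A small illustration with invented values (the dataclass is restated so the snippet runs on its own):

```python
# Illustration only: invented IDs and timings for a single GigaSpeech segment.
from dataclasses import dataclass

@dataclass
class GigaSpeechRow:  # mirrors the dataclass in gigaspeech_prepare.py
    utt_id: str
    audio_id: str
    audio_path: str
    speaker: str
    begin_time: float
    end_time: float
    duration: float
    text: str

begin_time, end_time = 12.48, 15.90
row = GigaSpeechRow(
    utt_id="POD0000000001_S0000012",
    audio_id="POD0000000001",
    audio_path="/path/to/GigaSpeech/audio/podcast/POD0000000001.wav",
    speaker="N/A",
    begin_time=begin_time,
    end_time=end_time,
    duration=end_time - begin_time,  # derived, as in HF_process_line
    text="hello world.",
)
print(round(row.duration, 2))  # 3.42
```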
@@ -318,14 +323,18 @@ def create_csv( os.replace(csv_file_tmp, csv_file) - logger.info(f"{csv_file} succesfully created!") + logger.info(f"{csv_file} successfully created!") logger.info(f"Number of samples in {split} split: {nb_samples}") logger.info( f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" ) -def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: +def HF_create_csv( + csv_file: str, + hf_dataset, + split: str, +) -> None: """ Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. @@ -347,7 +356,9 @@ def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: total_duration = 0.0 nb_samples = 0 - line_processor = functools.partial(HF_process_line,) + line_processor = functools.partial( + HF_process_line, + ) csv_file_tmp = csv_file + ".tmp" with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: @@ -388,14 +399,16 @@ def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: os.replace(csv_file_tmp, csv_file) - print(f"{csv_file} succesfully created!") - print(f"Number of samples in {split} split: {nb_samples}") - print( + logger.info(f"{csv_file} successfully created!") + logger.info(f"Number of samples in {split} split: {nb_samples}") + logger.info( f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" ) -def HF_process_line(row,) -> list: +def HF_process_line( + row, +) -> list: """ Process the audio line and return the utterances for the given split. @@ -518,7 +531,7 @@ def preprocess_text(text: str) -> str: def skip_csv(save_csv_files: dict) -> bool: - """ Check if the CSV files already exist. + """Check if the CSV files already exist. Parameters ---------- diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index cde698345f..e528829c96 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -3,12 +3,14 @@ Authors * Adel Moumen 2024 """ + import os import sys import torch import logging import speechbrain as sb from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml logger = logging.getLogger(__name__) @@ -78,22 +80,20 @@ def compute_objectives(self, predictions, batch, stage): tokens = self.hparams.wav_augment.replicate_labels(tokens) tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - loss = loss_ctc + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) if stage == sb.Stage.VALID: # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) elif stage == sb.Stage.TEST: predicted_words = [ hyp[0].text.split(" ") for hyp in predicted_tokens ] if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.text] + target_words = [wrd.split(" ") for wrd in batch.wrd] self.wer_metric.append(ids, predicted_words, target_words) self.cer_metric.append(ids, predicted_words, target_words) @@ -143,7 +143,8 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], + meta={"WER": stage_stats["WER"]}, + min_keys=["WER"], ) elif 
stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( @@ -186,13 +187,15 @@ def init_optimizers(self): self.checkpointer.add_recoverable("modelopt", self.model_optimizer) -def dataio_prepare(hparams): +def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" + It also defines the data processing pipeline through user-defined functions. + """ data_folder = hparams["data_folder"] train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + csv_path=hparams["train_csv"], + replacements={"data_root": data_folder}, ) if hparams["sorting"] == "ascending": @@ -217,12 +220,14 @@ def dataio_prepare(hparams): ) valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + csv_path=hparams["valid_csv"], + replacements={"data_root": data_folder}, ) valid_data = valid_data.filtered_sorted(sort_key="duration") test_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["test_csv"], replacements={"data_root": data_folder}, + csv_path=hparams["test_csv"], + replacements={"data_root": data_folder}, ) # We also sort the validation data so it is faster to validate @@ -245,42 +250,30 @@ def audio_pipeline(audio_path, begin_time, end_time): return sig sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() # 3. Define text pipeline: @sb.utils.data_pipeline.takes("text") @sb.utils.data_pipeline.provides( - "text", "char_list", "tokens_list", "tokens" + "wrd", "char_list", "tokens_list", "tokens" ) - def text_pipeline(text): - yield text - char_list = list(text) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) yield char_list - tokens_list = label_encoder.encode_sequence(char_list) + tokens_list = tokenizer.sp.encode_as_ids(wrd) yield tokens_list tokens = torch.LongTensor(tokens_list) yield tokens sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - # 4. 
Set output: sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "text", "char_list", "tokens"], + datasets, + ["id", "sig", "text", "char_list", "tokens"], ) - return train_data, valid_data, test_data, label_encoder + return train_data, valid_data, test_data if __name__ == "__main__": @@ -321,8 +314,22 @@ def text_pipeline(text): }, ) + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_data, label_encoder = dataio_prepare(hparams) + train_data, valid_data, test_data, label_encoder = dataio_prepare( + hparams, tokenizer + ) # Trainer initialization asr_brain = ASR( @@ -337,17 +344,20 @@ def text_pipeline(text): run_on_main(hparams["pretrainer"].collect_files) hparams["pretrainer"].load_collected() - # We dynamicaly add the tokenizer to our brain class. + # We dynamically add the tokenizer to our brain class. # NB: This tokenizer corresponds to the one used for the LM!! asr_brain.tokenizer = label_encoder ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, + **hparams["test_beam_search"], + vocab_list=vocab_list, ) # Training @@ -364,7 +374,7 @@ def text_pipeline(text): # report WER on valid data asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"valid_wer.txt" + hparams["output_wer_folder"], "valid_wer.txt" ) asr_brain.evaluate( valid_data, @@ -374,7 +384,7 @@ def text_pipeline(text): # report WER on test data asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"test_wer.txt" + hparams["output_wer_folder"], "test_wer.txt" ) asr_brain.evaluate( test_data, diff --git a/recipes/GigaSpeech/dataset.py b/recipes/GigaSpeech/dataset.py index 9dee453b28..3b6219efc8 100644 --- a/recipes/GigaSpeech/dataset.py +++ b/recipes/GigaSpeech/dataset.py @@ -144,8 +144,7 @@ class GigaspeechConfig(datasets.BuilderConfig): """BuilderConfig for Gigaspeech.""" def __init__(self, name, *args, **kwargs): - """BuilderConfig for Gigaspeech - """ + """BuilderConfig for Gigaspeech""" super().__init__(name=name, *args, **kwargs) # larger subsets are supersets of smaller subsets, # if we want to download "m", we need to download "xs" and "s" data too. 
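
For clarity, the superset rule described in the comment above can be sketched in isolation. This is illustrative only: `_SUBSETS` mirrors the tuple defined in `dataset.py`, and `subsets_to_download` is a hypothetical free-function restatement of what `GigaspeechConfig.__init__` stores on the config.

# Sketch of the subset-resolution rule from GigaspeechConfig.__init__:
# requesting a training subset implies downloading every smaller subset too,
# while "dev"/"test" stand alone. Mirrors dataset.py; for illustration only.
_SUBSETS = ("xs", "s", "m", "l", "xl")

def subsets_to_download(name: str) -> tuple:
    """Return all subsets that must be fetched to build `name`."""
    if name in {"dev", "test"}:
        return (name,)
    return _SUBSETS[: _SUBSETS.index(name) + 1]

assert subsets_to_download("m") == ("xs", "s", "m")
assert subsets_to_download("dev") == ("dev",)
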
@@ -304,11 +303,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["train"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "train" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["train"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["train"], }, ), @@ -319,11 +318,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["dev"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "dev" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["dev"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["dev"], }, ), @@ -334,11 +333,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["test"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "test" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["test"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["test"], }, ), @@ -353,11 +352,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["dev"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "dev" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["dev"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["dev"], }, ), @@ -372,11 +371,11 @@ def _split_generators(self, dl_manager): dl_manager.iter_archive(archive_path) for archive_path in audio_archives_paths["test"] ], - "local_audio_archives_paths": local_audio_archives_paths[ - "test" - ] - if local_audio_archives_paths - else None, + "local_audio_archives_paths": ( + local_audio_archives_paths["test"] + if local_audio_archives_paths + else None + ), "meta_paths": meta_paths["test"], }, ), diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 1b45ef07ee..a1cd2f1db7 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -34,7 +34,7 @@ @dataclass class GigaSpeechRow: - """ Dataclass for handling GigaSpeech rows. + """Dataclass for handling GigaSpeech rows. Attributes ---------- @@ -78,7 +78,7 @@ def prepare_gigaspeech( convert_opus_to_wav: bool = True, download_with_HF: bool = False, ) -> None: - """ Prepare the csv files for GigaSpeech dataset. + """Prepare the csv files for GigaSpeech dataset. 
Download instructions: https://github.com/SpeechColab/GigaSpeech Reference: https://arxiv.org/abs/2106.06909 @@ -145,13 +145,13 @@ def prepare_gigaspeech( train_split = "" for split in splits: if split in TRAIN_SUBSET: - save_output[split] = output_train + save_output["train"] = output_train train_split = split else: if split == "DEV": - save_output[split] = output_dev + save_output["validation"] = output_dev elif split == "TEST": - save_output[split] = output_test + save_output["test"] = output_test # check if the data is already prepared if skip_csv(save_output): @@ -173,7 +173,9 @@ def prepare_gigaspeech( for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") HF_create_csv( - output, hf_dataset[split], split, + output, + hf_dataset[split], + split, ) else: # check that the data folder contains the GigaSpeech dataset @@ -191,7 +193,10 @@ def prepare_gigaspeech( def process_line( - audio: json, data_folder: str, split: str, convert_opus_to_wav: bool, + audio: json, + data_folder: str, + split: str, + convert_opus_to_wav: bool, ) -> list: """ Process the audio line and return the utterances for the given split. @@ -318,14 +323,18 @@ def create_csv( os.replace(csv_file_tmp, csv_file) - logger.info(f"{csv_file} succesfully created!") + logger.info(f"{csv_file} successfully created!") logger.info(f"Number of samples in {split} split: {nb_samples}") logger.info( f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" ) -def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: +def HF_create_csv( + csv_file: str, + hf_dataset, + split: str, +) -> None: """ Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. @@ -347,7 +356,9 @@ def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: total_duration = 0.0 nb_samples = 0 - line_processor = functools.partial(HF_process_line,) + line_processor = functools.partial( + HF_process_line, + ) csv_file_tmp = csv_file + ".tmp" with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: @@ -388,14 +399,16 @@ def HF_create_csv(csv_file: str, hf_dataset, split: str,) -> None: os.replace(csv_file_tmp, csv_file) - print(f"{csv_file} succesfully created!") - print(f"Number of samples in {split} split: {nb_samples}") - print( + logger.info(f"{csv_file} successfully created!") + logger.info(f"Number of samples in {split} split: {nb_samples}") + logger.info( f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" ) -def HF_process_line(row,) -> list: +def HF_process_line( + row, +) -> list: """ Process the audio line and return the utterances for the given split. @@ -518,7 +531,7 @@ def preprocess_text(text: str) -> str: def skip_csv(save_csv_files: dict) -> bool: - """ Check if the CSV files already exist. + """Check if the CSV files already exist. 
Parameters ---------- From 0f3da3239a2ae2bd520cb630436e30a890f633d3 Mon Sep 17 00:00:00 2001 From: Adel Moumen Date: Fri, 29 Mar 2024 16:48:16 +0100 Subject: [PATCH 35/77] remove csv --- recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml | 1 - .../GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py | 1 - 2 files changed, 2 deletions(-) delete mode 100644 recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml delete mode 100644 recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py diff --git a/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml b/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml deleted file mode 100644 index 464090415c..0000000000 --- a/recipes/GigaSpeech/ASR/transformer/hparams/train_hf_whisper.yaml +++ /dev/null @@ -1 +0,0 @@ -# TODO diff --git a/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py b/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py deleted file mode 100644 index 044a4824d9..0000000000 --- a/recipes/GigaSpeech/ASR/transformer/hparams/train_with_hf_whisper.py +++ /dev/null @@ -1 +0,0 @@ -# todo From 0009cf275686215a9ea49fe055c82c306c01883c Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 13:48:47 +0100 Subject: [PATCH 36/77] bunch of updates to make it run --- .../GigaSpeech/ASR/CTC/extra_requirements.txt | 2 + .../ASR/CTC/hparams/train_hf_wavlm.yaml | 84 ++++++++--- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 88 +++++++++-- recipes/GigaSpeech/extra_requirements.txt | 2 - recipes/GigaSpeech/gigaspeech_prepare.py | 140 +++++++++++++----- 5 files changed, 251 insertions(+), 65 deletions(-) delete mode 100644 recipes/GigaSpeech/extra_requirements.txt diff --git a/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt index afad715b9f..a619ba044a 100644 --- a/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt +++ b/recipes/GigaSpeech/ASR/CTC/extra_requirements.txt @@ -1,3 +1,5 @@ +datasets kenlm +soundfile speechcolab transformers diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index daf3443d9a..884ee205d9 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -2,7 +2,7 @@ # Model: wavlm + DNN + CTC # Decoding AM: Greedy for validation, and Beam search for testing # Augmentation: SpecAugment -# Authors: Adel Moumen 2024 +# Authors: Adel Moumen 2024, Titouan Parcollet 2024 # ################################ # Seed needs to be set at top of yaml, before objects with parameters are made @@ -23,10 +23,12 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # see https://github.com/SpeechColab/GigaSpeech for more details on the dataset # must be one of ["XS", "S", "M", "L", "XL"] # and ["DEV", "TEST"] for the eval splits. 
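
As a quick illustration of the constraint stated in the comment above (at most one training subset, plus the optional eval splits), a minimal check along the lines of what `prepare_gigaspeech` performs could look like the sketch below. The constant names mirror `gigaspeech_prepare.py`; `validate_splits` itself is a hypothetical standalone helper, not part of the recipe.

# Minimal sketch of the `splits` validation done inside prepare_gigaspeech:
# every entry must be a known subset name, and only one training subset
# (XS/S/M/L/XL) may be selected at a time.
TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"]
SPLITS = ["DEV", "TEST"]

def validate_splits(splits: list) -> None:
    for split in splits:
        assert (
            split in SPLITS + TRAIN_SUBSET
        ), f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}."
    if len(set(splits).intersection(TRAIN_SUBSET)) > 1:
        raise ValueError("You cannot use multiple train subsets.")

validate_splits(["XL", "DEV", "TEST"])  # passes silently
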
-splits: ["XS", "DEV", "TEST"] +splits: ["XL", "DEV", "TEST"] skip_prep: False download_with_HF: True convert_opus_to_wav: True +keep_filler_words: False +keep_punctuation: False ckpt_interval_minutes: 25 # save checkpoint every N min train_csv: !ref /train.csv valid_csv: !ref /dev.csv @@ -34,18 +36,18 @@ test_csv: !ref /test.csv json_file: !ref /GigaSpeech.json # Training parameters -number_of_epochs: 1 +number_of_epochs: 5 lr: 0.9 lr_wav2vec: 0.0001 sorting: ascending num_workers: 4 -precision: fp32 # bf16, fp16 or fp32 +precision: fp16 # bf16, fp16 or fp32 sample_rate: 16000 # With data_parallel batch_size is split into N jobs # With DDP batch_size is multiplied by N jobs # Must be 3 per GPU to fit 32GB of VRAM -batch_size: 6 +batch_size: 8 test_batch_size: 1 # Dataloader options @@ -59,15 +61,46 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref +# Using dynamic batching by default. This works with 4x24GB GPUs +# Or turn it off (but training speed will decrease) +dynamic_batching: True +max_batch_length_train: 60 +max_batch_length_val: 30 # we reduce it as the beam is much wider (VRAM) +num_bucket: 200 +shuffle: True # if true re-creates batches at each epoch shuffling examples. +batch_ordering: random +max_batch_ex: 256 + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_valid: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 + # Model parameters activation: !name:torch.nn.LeakyReLU -dnn_layers: 2 dnn_neurons: 1024 -freeze_wav2vec: True +dropout: 0.1 +freeze_wav2vec: False +wav2vec_output_dim: 1024 # Outputs -output_neurons: 34 +output_neurons: 29 # without punctuation blank_index: 0 +bos_index: -1 # No bos/eos with CTC +eos_index: -1 # Decoding parameters test_beam_search: @@ -109,8 +142,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter concat_original: True repeat_augment: 1 shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 + min_augmentations: 2 + max_augmentations: 2 augment_prob: 1.0 augmentations: [ !ref , @@ -118,16 +151,33 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref ] -enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN - input_shape: [null, null, 1024] - activation: !ref - dnn_blocks: !ref - dnn_neurons: !ref +enc: !new:speechbrain.nnet.containers.Sequential + input_shape: [null, null, !ref ] + linear1: !name:speechbrain.nnet.linear.Linear + n_neurons: !ref + bias: True + bn1: !name:speechbrain.nnet.normalization.BatchNorm1d + activation: !new:torch.nn.LeakyReLU + drop: !new:torch.nn.Dropout + p: !ref + linear2: !name:speechbrain.nnet.linear.Linear + n_neurons: !ref + bias: True + bn2: !name:speechbrain.nnet.normalization.BatchNorm1d + activation2: !new:torch.nn.LeakyReLU + drop2: !new:torch.nn.Dropout + p: !ref + linear3: !name:speechbrain.nnet.linear.Linear + n_neurons: !ref + bias: True + bn3: !name:speechbrain.nnet.normalization.BatchNorm1d + activation3: !new:torch.nn.LeakyReLU wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref - output_norm: True + output_norm: False freeze: !ref + freeze_feature_extractor: True save_path: !ref ctc_lin: !new:speechbrain.nnet.linear.Linear @@ -153,7 +203,7 @@ model_opt_class: !name:torch.optim.Adadelta rho: 0.95 eps: 1.e-8 -wav2vec_opt_class: !name:torch.optim.Adam 
+wav2vec_opt_class: !name:torch.optim.AdamW lr: !ref lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index e528829c96..5d1d6cbd9f 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -4,14 +4,17 @@ * Adel Moumen 2024 """ +import logging import os import sys + import torch -import logging +from hyperpyyaml import load_hyperpyyaml + import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process from speechbrain.tokenizers.SentencePiece import SentencePiece -from hyperpyyaml import load_hyperpyyaml +from speechbrain.utils.data_utils import undo_padding +from speechbrain.utils.distributed import if_main_process, run_on_main logger = logging.getLogger(__name__) @@ -93,7 +96,9 @@ def compute_objectives(self, predictions, batch, stage): ] if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] + # Convert indices to words + target_words = undo_padding(tokens, tokens_lens) + target_words = self.tokenizer(target_words, task="decode_from_list") self.wer_metric.append(ids, predicted_words, target_words) self.cer_metric.append(ids, predicted_words, target_words) @@ -273,7 +278,33 @@ def text_pipeline(wrd): ["id", "sig", "text", "char_list", "tokens"], ) - return train_data, valid_data, test_data + # 5. If Dynamic Batching is used, we instantiate the needed samplers. + train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_valid = hparams["dynamic_batch_sampler_valid"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_valid, + ) + + return ( + train_data, + valid_data, + test_data, + train_batch_sampler, + valid_batch_sampler, + ) if __name__ == "__main__": @@ -311,6 +342,8 @@ def text_pipeline(wrd): "skip_prep": hparams["skip_prep"], "convert_opus_to_wav": hparams["convert_opus_to_wav"], "download_with_HF": hparams["download_with_HF"], + "punctuation": hparams["keep_punctuation"], + "filler": hparams["keep_filler_words"], }, ) @@ -319,7 +352,7 @@ def text_pipeline(wrd): model_dir=hparams["save_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], - annotation_read="wrd", + annotation_read="text", model_type=hparams["token_type"], character_coverage=hparams["character_coverage"], bos_id=hparams["bos_index"], @@ -327,9 +360,13 @@ def text_pipeline(wrd): ) # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_data, label_encoder = dataio_prepare( - hparams, tokenizer - ) + ( + train_data, + valid_data, + test_data, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) # Trainer initialization asr_brain = ASR( @@ -346,9 +383,34 @@ def text_pipeline(wrd): # We dynamically add the tokenizer to our brain class. # NB: This tokenizer corresponds to the one used for the LM!! 
- asr_brain.tokenizer = label_encoder + asr_brain.tokenizer = tokenizer + + # Manage dynamic batching + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + if train_bsampler is not None: + collate_fn = None + if "collate_fn" in train_dataloader_opts: + collate_fn = train_dataloader_opts["collate_fn"] + + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if collate_fn is not None: + train_dataloader_opts["collate_fn"] = collate_fn + + if valid_bsampler is not None: + collate_fn = None + if "collate_fn" in valid_dataloader_opts: + collate_fn = valid_dataloader_opts["collate_fn"] + + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + if collate_fn is not None: + valid_dataloader_opts["collate_fn"] = collate_fn - ind2lab = label_encoder.ind2lab vocab_list = [ tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) ] @@ -365,8 +427,8 @@ def text_pipeline(wrd): asr_brain.hparams.epoch_counter, train_data, valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Testing diff --git a/recipes/GigaSpeech/extra_requirements.txt b/recipes/GigaSpeech/extra_requirements.txt deleted file mode 100644 index 91de2461ce..0000000000 --- a/recipes/GigaSpeech/extra_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -speechcolab -webdataset diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index a1cd2f1db7..d9ec4b6b56 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -9,17 +9,34 @@ * Adel Moumen, 2024 """ +import csv +import functools +import json import logging import os -import json -import csv from dataclasses import dataclass -import functools -from speechbrain.utils.parallel import parallel_map + import speechbrain as sb +from speechbrain.utils.parallel import parallel_map logger = logging.getLogger(__name__) - +FILLERS = [ + "UH", + "UHH", + "UM", + "EH", + "MM", + "HM", + "AH", + "HUH", + "HA", + "ER", + "OOF", + "HEE", + "ACH", + "EEE", + "EW", +] GARBAGE_UTTERANCE_TAGS = ["", "", "", ""] PUNCTUATION_TAGS = { "": ",", @@ -77,6 +94,8 @@ def prepare_gigaspeech( skip_prep: bool = False, convert_opus_to_wav: bool = True, download_with_HF: bool = False, + punctuation: bool = False, + filler: bool = False, ) -> None: """Prepare the csv files for GigaSpeech dataset. @@ -115,6 +134,10 @@ def prepare_gigaspeech( be faster and more reliable than the official host. Make sure to read the instructions on how to get the dataset from Hugging Face here: https://huggingface.co/datasets/speechcolab/gigaspeech + punctuation: bool, optional + Keeping the punctuation, or not. + filler: bool, optional + Keeping filler words (hum), or not. Returns ------- @@ -163,6 +186,13 @@ def prepare_gigaspeech( if download_with_HF: from datasets import load_dataset + if os.path.exists("dataset.py"): + logger.info("HuggingFace dataset.py found.") + else: + raise FileNotFoundError( + "HuggingFace dataset.py not found. Please run this recipe from the correct recipe folder or copy the dataset.py file." 
+ ) + hf_dataset = load_dataset( "dataset.py", train_split.lower(), @@ -172,11 +202,7 @@ def prepare_gigaspeech( ) for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") - HF_create_csv( - output, - hf_dataset[split], - split, - ) + HF_create_csv(output, hf_dataset[split], split, punctuation, filler) else: # check that the data folder contains the GigaSpeech dataset check_gigaspeech_folders(data_folder, json_file) @@ -188,7 +214,15 @@ def prepare_gigaspeech( for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") - create_csv(output, info, data_folder, split, convert_opus_to_wav) + create_csv( + output, + info, + data_folder, + split, + convert_opus_to_wav, + punctuation, + filler, + ) logger.info("Data preparation completed!") @@ -197,6 +231,8 @@ def process_line( data_folder: str, split: str, convert_opus_to_wav: bool, + punctuation: bool = False, + filler: bool = False, ) -> list: """ Process the audio line and return the utterances for the given split. @@ -211,6 +247,10 @@ def process_line( The split to be used for filtering the data. convert_opus_to_wav : bool If True, the opus files will be converted to wav files. + punctuation : bool + Keeping punctuation or not. Default is no. + filler : bool + Keeping filler words or not (hum, er). Default is no. Returns ------- @@ -228,7 +268,7 @@ def process_line( # 2. iterate over the utterances utterances = [] for segment in audio["segments"]: - text = preprocess_text(segment["text_tn"]) + text = preprocess_text(segment["text_tn"], punctuation, filler) if text: begin_time = float(segment["begin_time"]) end_time = float(segment["end_time"]) @@ -253,6 +293,8 @@ def create_csv( data_folder: str, split: str, convert_opus_to_wav: bool, + punctuation: bool = False, + filler: bool = False, ) -> None: """ Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. @@ -269,6 +311,10 @@ def create_csv( The split to be used for filtering the data. convert_opus_to_wav : bool If True, the opus files will be converted to wav files. + punctuation : bool + Keeping punctuation or not. Default is no. + filler : bool + Keeping filler words or not (hum, er). Default is no. Returns ------- @@ -334,20 +380,25 @@ def HF_create_csv( csv_file: str, hf_dataset, split: str, + punctuation: bool = False, + filler: bool = False, ) -> None: """ - Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. + Create a CSV file based on a HuggingFace dataset. Parameters ---------- csv_file : str The path to the CSV file to be created. - info : dict - The GigaSpeech JSON file content. - data_folder : str - The path to the GigaSpeech dataset. + hf_dataset : huggingface dataset, + The huggingface dataset. split : str The split to be used for filtering the data. + punctuation : bool + Keeping punctuation or not. Default is no. + filler : bool + Keeping filler words or not (hum, er). Default is no. + Returns ------- @@ -358,6 +409,8 @@ def HF_create_csv( line_processor = functools.partial( HF_process_line, + punctuation=punctuation, + filler=filler, ) csv_file_tmp = csv_file + ".tmp" @@ -406,9 +459,7 @@ def HF_create_csv( ) -def HF_process_line( - row, -) -> list: +def HF_process_line(row: dict, punctuation: bool, filler: bool) -> list: """ Process the audio line and return the utterances for the given split. 
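
The pattern used above — binding the per-run options once with `functools.partial`, then mapping the resulting one-argument callable over dataset rows and dropping rows the processor rejects — can be sketched in isolation as follows. The toy `process_line` and `fake_rows` are stand-ins, not part of the recipe, which maps the real `HF_process_line` over a HuggingFace dataset with `speechbrain.utils.parallel.parallel_map`.

# Illustrative only: freeze keyword options into a one-argument callable,
# then map it over rows, skipping entries the processor rejects (None).
import functools

def process_line(row: dict, punctuation: bool, filler: bool):
    # Stand-in for HF_process_line: returning None drops the row.
    if not row["text"]:
        return None
    return (row["text"], punctuation, filler)

line_processor = functools.partial(process_line, punctuation=False, filler=False)

fake_rows = [{"text": "HELLO WORLD"}, {"text": ""}]
results = [r for r in map(line_processor, fake_rows) if r is not None]
print(results)  # [('HELLO WORLD', False, False)]
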
@@ -416,6 +467,10 @@ def HF_process_line( ---------- row: dict The audio line to be processed. + punctuation : bool + Keeping punctuation or not. Default is no. + filler : bool + Keeping filler words or not (hum, er). Default is no. Returns ------- @@ -433,7 +488,7 @@ def HF_process_line( logger.error(f"Failed reading {audio_path}: {e}") return None - text = preprocess_text(row["text"]) + text = preprocess_text(row["text"], punctuation, filler) if text: utt_id = row["segment_id"] audio_id = row["audio_id"] @@ -484,14 +539,19 @@ def convert_opus2wav(audio_opus_path): return audio_wav_path -def preprocess_text(text: str) -> str: +def preprocess_text(text: str, punctuation: bool, filler: bool) -> str: """ - Preprocesses the input text by removing garbage tags and replacing punctuation tags. + Preprocesses the input text by removing garbage tags and removing punctuation + and filler words if specified. Parameters ---------- text : str The input text to be preprocessed. + punctuation : bool + Keeping punctuation or not. Default is no. + filler : bool + Keeping filler words or not (hum, er). Default is no. Returns ------- @@ -515,19 +575,33 @@ def preprocess_text(text: str) -> str: >>> preprocess_text(text) "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you." """ - # Remove garbage tags - for tag in GARBAGE_UTTERANCE_TAGS: - if tag in text: - return "" - # Remove punctuation tags - for tag, punctuation in PUNCTUATION_TAGS.items(): - text = text.replace(" " + tag, punctuation) + text = text.upper() + text = text.replace("-", " ") + + to_remove = GARBAGE_UTTERANCE_TAGS + if not punctuation: + to_remove += PUNCTUATION_TAGS + if not filler: + to_remove += FILLERS + + processed = [] + for word in text.split(): + if word in to_remove: + continue + processed.append(word) + + sentence = " ".join(processed) + + if punctuation: + for tag, punctuation in PUNCTUATION_TAGS.items(): + sentence = sentence.replace(" " + tag, punctuation) assert ( - "<" not in text and ">" not in text - ), f"Found tags in the text: {text}" - return text.lower() + "<" not in sentence and ">" not in sentence + ), f"Found tags in the text: {sentence}" + + return sentence def skip_csv(save_csv_files: dict) -> bool: From 8bdbd1ea991e9228618714be422decb503927920 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 13:53:45 +0100 Subject: [PATCH 37/77] no download script --- .../GigaSpeech/ASR/CTC/download_gigaspeech.py | 95 ------------------- recipes/GigaSpeech/download_gigaspeech.py | 95 ------------------- 2 files changed, 190 deletions(-) delete mode 100644 recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py delete mode 100644 recipes/GigaSpeech/download_gigaspeech.py diff --git a/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py b/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py deleted file mode 100644 index 357540bbe5..0000000000 --- a/recipes/GigaSpeech/ASR/CTC/download_gigaspeech.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Note for reviewer: this is a temporary script. It may be removed in the future. -Note2: for EU/US users, using this script might be VERY slow. It is instead -recommended to use the HuggingFace script. - -Download script for GigaSpeech dataset. 
- -Download instructions: https://github.com/SpeechColab/GigaSpeech -Reference: https://arxiv.org/abs/2106.06909 - -Author -------- - * Adel Moumen, 2024 -""" - -import logging -from typing import Optional, Sequence, Union -import argparse - -logger = logging.getLogger(__name__) - - -def download_gigaspeech( - password: str, - target_dir: str = ".", - dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", - host: Optional[str] = "tsinghua", -) -> None: - """Download GigaSpeech dataset. - - Parameters - ---------- - password : str - The password to access the GigaSpeech dataset. - target_dir : str, optional - The path to the directory where the dataset will be downloaded. - dataset_parts : Union[str, Sequence[str]], optional - The parts of the dataset to be downloaded. - If "auto", all parts will be downloaded. - If a string, it should be a comma-separated list of parts to be downloaded. - If a list, it should be a list of parts to be downloaded. - host : str, optional - The host to be used for downloading the dataset. - The available hosts are described in https://github.com/SpeechColab/GigaSpeech. - """ - try: - from speechcolab.datasets.gigaspeech import GigaSpeech - except ImportError: - raise ImportError( - "Please install the speechcolab package to download the GigaSpeech dataset." - ) - gigaspeech = GigaSpeech(target_dir) - - if dataset_parts == ["auto"]: - dataset_parts = ["XL", "DEV", "TEST"] - - for part in dataset_parts: - logging.info(f"Downloading GigaSpeech part: {part}") - gigaspeech.download(password, "{" + part + "}", host=host) - - logger.info(f"GigaSpeech dataset finished downloading to {target_dir}.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download GigaSpeech dataset.") - parser.add_argument( - "--password", - type=str, - required=True, - help="The password to access the GigaSpeech dataset.", - ) - parser.add_argument( - "--target_dir", - type=str, - default=".", - help="The path to the directory where the dataset will be downloaded.", - ) - parser.add_argument( - "--dataset_parts", - type=str, - nargs="+", # '+' means one or more values will be collected into a list - default=["auto"], - help="The parts of the dataset to be downloaded.", - ) - parser.add_argument( - "--host", - type=str, - default="tsinghua", - help="The host to be used for downloading the dataset.", - ) - args = parser.parse_args() - - download_gigaspeech( - args.password, args.target_dir, args.dataset_parts, args.host - ) diff --git a/recipes/GigaSpeech/download_gigaspeech.py b/recipes/GigaSpeech/download_gigaspeech.py deleted file mode 100644 index 357540bbe5..0000000000 --- a/recipes/GigaSpeech/download_gigaspeech.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Note for reviewer: this is a temporary script. It may be removed in the future. -Note2: for EU/US users, using this script might be VERY slow. It is instead -recommended to use the HuggingFace script. - -Download script for GigaSpeech dataset. - -Download instructions: https://github.com/SpeechColab/GigaSpeech -Reference: https://arxiv.org/abs/2106.06909 - -Author -------- - * Adel Moumen, 2024 -""" - -import logging -from typing import Optional, Sequence, Union -import argparse - -logger = logging.getLogger(__name__) - - -def download_gigaspeech( - password: str, - target_dir: str = ".", - dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", - host: Optional[str] = "tsinghua", -) -> None: - """Download GigaSpeech dataset. 
- - Parameters - ---------- - password : str - The password to access the GigaSpeech dataset. - target_dir : str, optional - The path to the directory where the dataset will be downloaded. - dataset_parts : Union[str, Sequence[str]], optional - The parts of the dataset to be downloaded. - If "auto", all parts will be downloaded. - If a string, it should be a comma-separated list of parts to be downloaded. - If a list, it should be a list of parts to be downloaded. - host : str, optional - The host to be used for downloading the dataset. - The available hosts are described in https://github.com/SpeechColab/GigaSpeech. - """ - try: - from speechcolab.datasets.gigaspeech import GigaSpeech - except ImportError: - raise ImportError( - "Please install the speechcolab package to download the GigaSpeech dataset." - ) - gigaspeech = GigaSpeech(target_dir) - - if dataset_parts == ["auto"]: - dataset_parts = ["XL", "DEV", "TEST"] - - for part in dataset_parts: - logging.info(f"Downloading GigaSpeech part: {part}") - gigaspeech.download(password, "{" + part + "}", host=host) - - logger.info(f"GigaSpeech dataset finished downloading to {target_dir}.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download GigaSpeech dataset.") - parser.add_argument( - "--password", - type=str, - required=True, - help="The password to access the GigaSpeech dataset.", - ) - parser.add_argument( - "--target_dir", - type=str, - default=".", - help="The path to the directory where the dataset will be downloaded.", - ) - parser.add_argument( - "--dataset_parts", - type=str, - nargs="+", # '+' means one or more values will be collected into a list - default=["auto"], - help="The parts of the dataset to be downloaded.", - ) - parser.add_argument( - "--host", - type=str, - default="tsinghua", - help="The host to be used for downloading the dataset.", - ) - args = parser.parse_args() - - download_gigaspeech( - args.password, args.target_dir, args.dataset_parts, args.host - ) From 80838726ff0cdbeec965c41566a9149102fff61d Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 13:56:39 +0100 Subject: [PATCH 38/77] fix precommit --- .../GigaSpeech/ASR/CTC/gigaspeech_prepare.py | 593 +----------------- 1 file changed, 1 insertion(+), 592 deletions(-) mode change 100644 => 120000 recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py deleted file mode 100644 index a1cd2f1db7..0000000000 --- a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py +++ /dev/null @@ -1,592 +0,0 @@ -""" -Data preparation script for the GigaSpeech dataset. - -Download instructions: https://github.com/SpeechColab/GigaSpeech -Reference: https://arxiv.org/abs/2106.06909 - -Author -------- - * Adel Moumen, 2024 -""" - -import logging -import os -import json -import csv -from dataclasses import dataclass -import functools -from speechbrain.utils.parallel import parallel_map -import speechbrain as sb - -logger = logging.getLogger(__name__) - -GARBAGE_UTTERANCE_TAGS = ["", "", "", ""] -PUNCTUATION_TAGS = { - "": ",", - "": "!", - "": ".", - "": "?", -} -SPLITS = ["DEV", "TEST"] -TRAIN_SUBSET = ["XS", "S", "M", "L", "XL"] -SAMPLING_RATE = 16000 - - -@dataclass -class GigaSpeechRow: - """Dataclass for handling GigaSpeech rows. - - Attributes - ---------- - utt_id : str - The segment ID. - audio_id : str - The audio ID. - audio_path : str - The path to the audio file. - speaker : str - The speaker ID. 
- begin_time : float - The start time of the segment. - end_time : float - The end time of the segment. - duration : float - The duration of the segment. - text : str - The text of the segment. - """ - - utt_id: str # segment[sid] - audio_id: str # audio[aid] - audio_path: str # by default this is opus files - speaker: str # audio["speaker"] - begin_time: float - end_time: float - duration: float - text: str - - -def prepare_gigaspeech( - data_folder: str, - save_folder: str, - splits: list, - output_train: str, - output_dev: str, - output_test: str, - json_file: str = "GigaSpeech.json", - skip_prep: bool = False, - convert_opus_to_wav: bool = True, - download_with_HF: bool = False, -) -> None: - """Prepare the csv files for GigaSpeech dataset. - - Download instructions: https://github.com/SpeechColab/GigaSpeech - Reference: https://arxiv.org/abs/2106.06909 - - The `train.csv` file is created by following the train subset specified in the `splits` list. - It must be part of the `TRAIN_SUBSET` list. You cannot use multiple train subsets. - - The `dev.csv` and `test.csv` files are created based on the `DEV` and `TEST` splits - specified in the `splits` list. - - Parameters - ---------- - data_folder : str - The path to the GigaSpeech dataset. - save_folder : str - The path to the folder where the CSV files will be saved. - splits : list - The list of splits to be used for creating the CSV files. - output_train : str - The path in which the train CSV or shards will be saved. - output_dev : str - The path in which the dev CSV or shards will be saved. - output_test : str - The path in which the test CSV or shards will be saved. - json_file : str, optional - The name of the JSON file containing the metadata of the GigaSpeech dataset. - skip_prep : bool, optional - If True, the data preparation will be skipped, and the function will return immediately. - convert_opus_to_wav : bool, optional - If True, the opus files will be converted to wav files. - download_with_HF : bool, optional - If True, the dataset will be downloaded using the Hugging Face datasets library. - We highly recommend using this option if you are based in the EU or US as it will - be faster and more reliable than the official host. Make sure to read the - instructions on how to get the dataset from Hugging Face here: - https://huggingface.co/datasets/speechcolab/gigaspeech - - Returns - ------- - None - """ - logger.info(f"Preparing GigaSpeech dataset in {save_folder}...") - - if skip_prep: - logger.info("Skipping data preparation as `skip_prep` is set to `True`") - return - - # check that `splits` input is valid - for split in splits: - assert ( - split in SPLITS + TRAIN_SUBSET - ), f"Split {split} not recognized. Valid splits are {SPLITS + TRAIN_SUBSET}." - - # check that we are not using multiple train subsets - if len(set(splits).intersection(TRAIN_SUBSET)) > 1: - raise ValueError( - "You cannot use multiple train subsets. Please select only one train subset." 
- ) - - os.makedirs(save_folder, exist_ok=True) - - # Setting output paths - save_output = {} - train_split = "" - for split in splits: - if split in TRAIN_SUBSET: - save_output["train"] = output_train - train_split = split - else: - if split == "DEV": - save_output["validation"] = output_dev - elif split == "TEST": - save_output["test"] = output_test - - # check if the data is already prepared - if skip_csv(save_output): - logger.info("Skipping preparation, completed in previous run.") - return - else: - logger.info("Starting data preparation...") - - if download_with_HF: - from datasets import load_dataset - - hf_dataset = load_dataset( - "dataset.py", - train_split.lower(), - trust_remote_code=True, - cache_dir=data_folder, - data_dir=data_folder, - ) - for split, output in save_output.items(): - logger.info(f"Starting creating {output} using {split} split.") - HF_create_csv( - output, - hf_dataset[split], - split, - ) - else: - # check that the data folder contains the GigaSpeech dataset - check_gigaspeech_folders(data_folder, json_file) - - logger.info(f"Starting reading {json_file}.") - with open(json_file, "r") as f: - info = json.load(f) - logger.info(f"Reading {json_file} done.") - - for split, output in save_output.items(): - logger.info(f"Starting creating {output} using {split} split.") - create_csv(output, info, data_folder, split, convert_opus_to_wav) - logger.info("Data preparation completed!") - - -def process_line( - audio: json, - data_folder: str, - split: str, - convert_opus_to_wav: bool, -) -> list: - """ - Process the audio line and return the utterances for the given split. - - Parameters - ---------- - audio : dict - The audio line to be processed. - data_folder : str - The path to the GigaSpeech dataset. - split : str - The split to be used for filtering the data. - convert_opus_to_wav : bool - If True, the opus files will be converted to wav files. - - Returns - ------- - list - The list of utterances for the given split. - """ - if ("{" + split + "}") in audio["subsets"]: - - audio_path = os.path.join(data_folder, audio["path"]) - assert os.path.isfile(audio_path), f"File not found: {audio_path}" - - if convert_opus_to_wav and audio_path.endswith(".opus"): - audio_path = convert_opus2wav(audio_path) - - # 2. iterate over the utterances - utterances = [] - for segment in audio["segments"]: - text = preprocess_text(segment["text_tn"]) - if text: - begin_time = float(segment["begin_time"]) - end_time = float(segment["end_time"]) - duration = end_time - begin_time - utterance = GigaSpeechRow( - utt_id=segment["sid"], - audio_id=audio["aid"], - audio_path=str(audio_path), - speaker=audio["speaker"], - begin_time=begin_time, - end_time=end_time, - duration=duration, - text=text, - ) - utterances.append(utterance) - return utterances - - -def create_csv( - csv_file: str, - info: json, - data_folder: str, - split: str, - convert_opus_to_wav: bool, -) -> None: - """ - Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. - - Parameters - ---------- - csv_file : str - The path to the CSV file to be created. - info : dict - The GigaSpeech JSON file content. - data_folder : str - The path to the GigaSpeech dataset. - split : str - The split to be used for filtering the data. - convert_opus_to_wav : bool - If True, the opus files will be converted to wav files. 
- - Returns - ------- - None - """ - total_duration = 0.0 - nb_samples = 0 - - line_processor = functools.partial( - process_line, - data_folder=data_folder, - split=split, - convert_opus_to_wav=convert_opus_to_wav, - ) - - csv_file_tmp = csv_file + ".tmp" - with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: - csv_writer = csv.writer( - csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - header = [ - "ID", - "audio_id", - "audio_path", - "speaker", - "begin_time", - "end_time", - "duration", - "text", - ] - csv_writer.writerow(header) - for row in parallel_map(line_processor, info["audios"]): - if row is None: - continue - - for item in row: - csv_writer.writerow( - [ - item.utt_id, - item.audio_id, - item.audio_path, - item.speaker, - str(item.begin_time), - str(item.end_time), - str(item.duration), - item.text, - ] - ) - - total_duration += item.duration - nb_samples += 1 - - os.replace(csv_file_tmp, csv_file) - - logger.info(f"{csv_file} successfully created!") - logger.info(f"Number of samples in {split} split: {nb_samples}") - logger.info( - f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" - ) - - -def HF_create_csv( - csv_file: str, - hf_dataset, - split: str, -) -> None: - """ - Create a CSV file based on the info in the GigaSpeech JSON file and filter the data based on the split. - - Parameters - ---------- - csv_file : str - The path to the CSV file to be created. - info : dict - The GigaSpeech JSON file content. - data_folder : str - The path to the GigaSpeech dataset. - split : str - The split to be used for filtering the data. - - Returns - ------- - None - """ - total_duration = 0.0 - nb_samples = 0 - - line_processor = functools.partial( - HF_process_line, - ) - - csv_file_tmp = csv_file + ".tmp" - with open(csv_file_tmp, mode="w", encoding="utf-8") as csv_f: - csv_writer = csv.writer( - csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - header = [ - "ID", - "audio_id", - "audio_path", - "speaker", - "begin_time", - "end_time", - "duration", - "text", - ] - csv_writer.writerow(header) - - for row in parallel_map(line_processor, hf_dataset, chunk_size=1024): - if row is None: - continue - - csv_writer.writerow( - [ - row.utt_id, - row.audio_id, - row.audio_path, - row.speaker, - str(row.begin_time), - str(row.end_time), - str(row.duration), - row.text, - ] - ) - - total_duration += row.duration - nb_samples += 1 - - os.replace(csv_file_tmp, csv_file) - - logger.info(f"{csv_file} successfully created!") - logger.info(f"Number of samples in {split} split: {nb_samples}") - logger.info( - f"Total duration of {split} split: {round(total_duration / 3600, 2)} Hours" - ) - - -def HF_process_line( - row, -) -> list: - """ - Process the audio line and return the utterances for the given split. - - Parameters - ---------- - row: dict - The audio line to be processed. - - Returns - ------- - list - The list of utterances for the given split. 
- """ - audio_path = os.path.join(row["audio"]["path"]) - - assert os.path.isfile(audio_path), f"File not found: {audio_path}" - - # check reading the audio file ; HF may have some corrupted files - try: - _ = sb.dataio.dataio.read_audio(audio_path) - except Exception as e: - logger.error(f"Failed reading {audio_path}: {e}") - return None - - text = preprocess_text(row["text"]) - if text: - utt_id = row["segment_id"] - audio_id = row["audio_id"] - audio_path = row["audio"]["path"] - speaker = row["speaker"] - begin_time = float(row["begin_time"]) - end_time = float(row["end_time"]) - duration = end_time - begin_time - - row = GigaSpeechRow( - utt_id=utt_id, - audio_id=audio_id, - audio_path=audio_path, - speaker=speaker, - begin_time=begin_time, - end_time=end_time, - duration=duration, - text=text, - ) - - return row - else: - return None - - -def convert_opus2wav(audio_opus_path): - """Convert an opus file to a wav file. - - Parameters - ---------- - audio_opus_path : str - The path to the opus file to be converted. - - Returns - ------- - str - The path to the converted wav file. - - Raises - ------ - subprocess.CalledProcessError - If the conversion process fails. - """ - audio_wav_path = audio_opus_path.replace(".opus", ".wav") - os.system( - f"ffmpeg -y -i {audio_opus_path} -ac 1 -ar {SAMPLING_RATE} {audio_wav_path} > /dev/null 2>&1" - ) - return audio_wav_path - - -def preprocess_text(text: str) -> str: - """ - Preprocesses the input text by removing garbage tags and replacing punctuation tags. - - Parameters - ---------- - text : str - The input text to be preprocessed. - - Returns - ------- - str - The preprocessed text with removed garbage tags and replaced punctuation tags. - - Raises - ------ - AssertionError - If '<' or '>' tags are found in the text after preprocessing. - - Notes - ----- - The function iterates over predefined garbage utterance tags (GARBAGE_UTTERANCE_TAGS) - and removes them from the input text. It then iterates over predefined punctuation tags - (PUNCTUATION_TAGS) and replaces them with the corresponding punctuation. - - Examples - -------- - >>> text = " DOUGLAS MCGRAY IS GOING TO BE OUR GUIDE YOU WALK THROUGH THE DOOR YOU SEE THE RED CARPETING YOU SEE SOMEONE IN A SUIT THEY MAY BE GREETING YOU " - >>> preprocess_text(text) - "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you." - """ - # Remove garbage tags - for tag in GARBAGE_UTTERANCE_TAGS: - if tag in text: - return "" - - # Remove punctuation tags - for tag, punctuation in PUNCTUATION_TAGS.items(): - text = text.replace(" " + tag, punctuation) - - assert ( - "<" not in text and ">" not in text - ), f"Found tags in the text: {text}" - return text.lower() - - -def skip_csv(save_csv_files: dict) -> bool: - """Check if the CSV files already exist. - - Parameters - ---------- - save_csv_files : dict - The dictionary containing the paths to the CSV files. - - Returns - ------- - bool - True if all the CSV files already exist, False otherwise. - """ - return all(os.path.isfile(path) for path in save_csv_files.values()) - - -def check_gigaspeech_folders( - data_folder: str, - json_file: str = "GigaSpeech.json", - audio_folder: str = "audio", -) -> None: - """Check if the data folder actually contains the GigaSpeech dataset. - - If it does not, an error is raised. - - Parameters - ---------- - data_folder : str - The path to the GigaSpeech dataset. 
- json_file : str, optional - The name of the JSON file containing the metadata of the GigaSpeech dataset. - audio_folder : str, optional - The name of the folder containing the audio files of the GigaSpeech dataset. - - Returns - ------- - None - - Raises - ------ - OSError - If GigaSpeech is not found at the specified path. - """ - # Checking if "GigaSpeech.json" exist - if not os.path.exists(json_file): - err_msg = ( - "the opus file %s does not exist (it is expected in the " - "Gigaspeech dataset)" % json_file - ) - raise OSError(err_msg) - - # Check if audio folders exist - for folder_subset in ["audiobook", "podcast", "youtube"]: - audio_subset = os.path.join(data_folder, audio_folder, folder_subset) - if not os.path.exists(audio_subset): - err_msg = ( - "the file %s does not exist (it is expected in the " - "Gigaspeech dataset)" % audio_subset - ) - raise OSError(err_msg) diff --git a/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py new file mode 120000 index 0000000000..5190685a8e --- /dev/null +++ b/recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py @@ -0,0 +1 @@ +../../gigaspeech_prepare.py \ No newline at end of file From a362bca38fc7d927fe9fda0207985b3b198f99ff Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 13:58:43 +0100 Subject: [PATCH 39/77] fix precommit --- recipes/GigaSpeech/ASR/CTC/dataset.py | 441 +------------------------- recipes/GigaSpeech/dataset.py | 1 - 2 files changed, 1 insertion(+), 441 deletions(-) mode change 100644 => 120000 recipes/GigaSpeech/ASR/CTC/dataset.py diff --git a/recipes/GigaSpeech/ASR/CTC/dataset.py b/recipes/GigaSpeech/ASR/CTC/dataset.py deleted file mode 100644 index 3b6219efc8..0000000000 --- a/recipes/GigaSpeech/ASR/CTC/dataset.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# MODIFIED BY: Adel Moumen 2024 -""" -GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality -labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised -and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts -and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, -sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable -for speech recognition training, and to filter out segments with low-quality transcription. For system training, -GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. -For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, -and for all our other smaller training subsets, we cap it at 0%. 
The DEV and TEST evaluation sets, on the other hand, -are re-processed by professional human transcribers to ensure high transcription quality. -""" - -import csv -import os - -import datasets - -_CITATION = """\ -@article{DBLP:journals/corr/abs-2106-06909, - author = {Guoguo Chen and - Shuzhou Chai and - Guanbo Wang and - Jiayu Du and - Wei{-}Qiang Zhang and - Chao Weng and - Dan Su and - Daniel Povey and - Jan Trmal and - Junbo Zhang and - Mingjie Jin and - Sanjeev Khudanpur and - Shinji Watanabe and - Shuaijiang Zhao and - Wei Zou and - Xiangang Li and - Xuchen Yao and - Yongqing Wang and - Yujun Wang and - Zhao You and - Zhiyong Yan}, - title = {GigaSpeech: An Evolving, Multi-domain {ASR} Corpus with 10, 000 Hours - of Transcribed Audio}, - journal = {CoRR}, - volume = {abs/2106.06909}, - year = {2021}, - url = {https://arxiv.org/abs/2106.06909}, - eprinttype = {arXiv}, - eprint = {2106.06909}, - timestamp = {Wed, 29 Dec 2021 14:29:26 +0100}, - biburl = {https://dblp.org/rec/journals/corr/abs-2106-06909.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} -""" - -_DESCRIPTION = """\ -GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality -labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised -and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts -and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, -sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable -for speech recognition training, and to filter out segments with low-quality transcription. For system training, -GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. -For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, -and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, -are re-processed by professional human transcribers to ensure high transcription quality. 
-""" - -_HOMEPAGE = "https://github.com/SpeechColab/GigaSpeech" - -_LICENSE = "Apache License 2.0" - -_CATEGORIES = ( - "People and Blogs", - "Business", - "Nonprofits and Activism", - "Crime", - "History", - "Pets and Animals", - "News and Politics", - "Travel and Events", - "Kids and Family", - "Leisure", - "N/A", - "Comedy", - "News and Politics", - "Sports", - "Arts", - "Science and Technology", - "Autos and Vehicles", - "Science and Technology", - "People and Blogs", - "Music", - "Society and Culture", - "Education", - "Howto and Style", - "Film and Animation", - "Gaming", - "Entertainment", - "Travel and Events", - "Health and Fitness", - "audiobook", -) - -_SOURCES = ("audiobook", "podcast", "youtube") - -_SUBSETS = ("xs", "s", "m", "l", "xl") - -_BASE_DATA_URL = ( - "https://huggingface.co/datasets/speechcolab/gigaspeech/resolve/main/data/" -) - -_AUDIO_ARCHIVE_URL = ( - _BASE_DATA_URL - + "audio/{subset}_files{is_additional}/{subset}_chunks_{archive_id:04}.tar.gz" -) - -_META_URL = ( - _BASE_DATA_URL - + "metadata/{subset}_metadata{is_additional}/{subset}_chunks_{archive_id:04}_metadata.csv" -) - -_N_ARCHIVES_URL = _BASE_DATA_URL + "{subset}_n_archives{is_additional}.txt" - -logger = datasets.utils.logging.get_logger(__name__) - - -class GigaspeechConfig(datasets.BuilderConfig): - """BuilderConfig for Gigaspeech.""" - - def __init__(self, name, *args, **kwargs): - """BuilderConfig for Gigaspeech""" - super().__init__(name=name, *args, **kwargs) - # larger subsets are supersets of smaller subsets, - # if we want to download "m", we need to download "xs" and "s" data too. - # so if name == "m", self.subsets_to_download will be ("xs", "s", "m") - if name not in {"dev", "test"}: - self.subsets_to_download = _SUBSETS[: _SUBSETS.index(name) + 1] - else: - self.subsets_to_download = (name,) - - -class Gigaspeech(datasets.GeneratorBasedBuilder): - """ - GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality - labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised - and unsupervised training (this implementation contains only labelled data for now). - Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts - and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, - sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable - for speech recognition training, and to filter out segments with low-quality transcription. For system training, - GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. - For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, - and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, - are re-processed by professional human transcribers to ensure high transcription quality. 
- """ - - VERSION = datasets.Version("1.0.0") - - BUILDER_CONFIGS = [ - GigaspeechConfig(name=subset) for subset in _SUBSETS + ("dev", "test") - ] - - DEFAULT_WRITER_BATCH_SIZE = 128 - - def _info(self): - features = datasets.Features( - { - "segment_id": datasets.Value("string"), - "speaker": datasets.Value("string"), - "text": datasets.Value("string"), - "audio": datasets.Audio(sampling_rate=16_000, decode=False), - "begin_time": datasets.Value("float32"), - "end_time": datasets.Value("float32"), - "audio_id": datasets.Value("string"), - "title": datasets.Value("string"), - "url": datasets.Value("string"), - "source": datasets.ClassLabel(names=_SOURCES), - "category": datasets.ClassLabel(names=_CATEGORIES), - "original_full_path": datasets.Value( - "string" - ), # relative path to full audio in original data dirs - } - ) - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _is_additional_data(self, name): - if name in {"s", "m", "l", "xl"}: - return "_additional" - return "" - - @property - def _splits_to_subsets(self): - return { - "train": self.config.subsets_to_download, - "dev": ["dev"], - "test": ["test"], - } - - def _read_n_archives(self, n_archives_path): - with open(n_archives_path, encoding="utf-8") as f: - return int(f.read().strip()) - - def _split_generators(self, dl_manager): - splits_to_subsets = self._splits_to_subsets - if self.config.name in {"dev", "test"}: - splits = (self.config.name,) - else: - splits = ("train", "dev", "test") - - # 1. get number of archives (shards) in each subset - n_archives_links = { - split: { - subset: _N_ARCHIVES_URL.format( - subset=subset, - is_additional=self._is_additional_data(subset), - ) - for subset in splits_to_subsets[split] - } - for split in splits - } - n_archives_paths = dl_manager.download_and_extract(n_archives_links) - n_archives = { - # mapping from a subset to a single number - number of audio archives (shards) in a subset - split: { - subset: self._read_n_archives(n_archives_paths[split][subset]) - for subset in splits_to_subsets[split] - } - for split in splits - } - - # 2. prepare sharded archives with audio files - audio_archives_urls = { - split: { - subset: [ - _AUDIO_ARCHIVE_URL.format( - subset=subset, - is_additional=self._is_additional_data(subset), - archive_id=i, - ) - for i in range(n_archives[split][subset]) - ] - for subset in splits_to_subsets[split] - } - for split in splits - } - audio_archives_paths = dl_manager.download(audio_archives_urls) - # flatten archives paths from - # {"train": {"xs": [path1, path2,], "s": [path3], "m": [path5, path5]}, "dev": {"dev": [path6,...]}, "test": {"test": [...]}} - # to {"train": [path1, path2, path3, path4, path5], "dev": [path6, ...], "test": [...]} - audio_archives_paths = _flatten_nested_dict(audio_archives_paths) - local_audio_archives_paths = ( - dl_manager.extract(audio_archives_paths) - if not dl_manager.is_streaming - else None - ) - - # 3. 
prepare sharded metadata csv files - meta_urls = { - split: { - subset: [ - _META_URL.format( - subset=subset, - is_additional=self._is_additional_data(subset), - archive_id=i, - ) - for i in range(n_archives[split][subset]) - ] - for subset in splits_to_subsets[split] - } - for split in splits - } - meta_paths = dl_manager.download_and_extract(meta_urls) - meta_paths = _flatten_nested_dict(meta_paths) - - if self.config.name not in {"dev", "test"}: - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "audio_archives_iterators": [ - dl_manager.iter_archive(archive_path) - for archive_path in audio_archives_paths["train"] - ], - "local_audio_archives_paths": ( - local_audio_archives_paths["train"] - if local_audio_archives_paths - else None - ), - "meta_paths": meta_paths["train"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "audio_archives_iterators": [ - dl_manager.iter_archive(archive_path) - for archive_path in audio_archives_paths["dev"] - ], - "local_audio_archives_paths": ( - local_audio_archives_paths["dev"] - if local_audio_archives_paths - else None - ), - "meta_paths": meta_paths["dev"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "audio_archives_iterators": [ - dl_manager.iter_archive(archive_path) - for archive_path in audio_archives_paths["test"] - ], - "local_audio_archives_paths": ( - local_audio_archives_paths["test"] - if local_audio_archives_paths - else None - ), - "meta_paths": meta_paths["test"], - }, - ), - ] - - if self.config.name == "dev": - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "audio_archives_iterators": [ - dl_manager.iter_archive(archive_path) - for archive_path in audio_archives_paths["dev"] - ], - "local_audio_archives_paths": ( - local_audio_archives_paths["dev"] - if local_audio_archives_paths - else None - ), - "meta_paths": meta_paths["dev"], - }, - ), - ] - - if self.config.name == "test": - return [ - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "audio_archives_iterators": [ - dl_manager.iter_archive(archive_path) - for archive_path in audio_archives_paths["test"] - ], - "local_audio_archives_paths": ( - local_audio_archives_paths["test"] - if local_audio_archives_paths - else None - ), - "meta_paths": meta_paths["test"], - }, - ), - ] - - def _generate_examples( - self, audio_archives_iterators, local_audio_archives_paths, meta_paths - ): - assert len(audio_archives_iterators) == len(meta_paths) - if local_audio_archives_paths: - assert len(audio_archives_iterators) == len( - local_audio_archives_paths - ) - - for i, (meta_path, audio_archive_iterator) in enumerate( - zip(meta_paths, audio_archives_iterators) - ): - meta_dict = dict() - with open(meta_path) as csvfile: - meta_csv = csv.DictReader(csvfile) - for line in meta_csv: - meta_dict[line["sid"]] = line - - for audio_path_in_archive, audio_file in audio_archive_iterator: - # `audio_path_in_archive` is like "dev_chunks_0000/YOU1000000029_S0000095.wav" - audio_filename = os.path.split(audio_path_in_archive)[1] - audio_id = audio_filename.split(".wav")[0] - audio_meta = meta_dict[audio_id] - audio_meta["segment_id"] = audio_meta.pop("sid") - audio_meta["original_full_path"] = audio_meta.pop("path") - audio_meta["text"] = audio_meta.pop("text_tn") - audio_meta["audio_id"] = audio_meta.pop("aid") - if not audio_meta["category"]: - audio_meta["category"] = "N/A" - - path = ( - os.path.join( - 
local_audio_archives_paths[i], audio_path_in_archive
-                )
-                if local_audio_archives_paths
-                else audio_path_in_archive
-            )
-
-            yield audio_id, {
-                "audio": {"path": path, "bytes": audio_file.read()},
-                **{
-                    feature: value
-                    for feature, value in audio_meta.items()
-                    if feature in self.info.features
-                },
-            }
-
-
-def _flatten_nested_dict(nested_dict):
-    return {
-        key: [
-            inner_list_element
-            for inner_list in value_to_lists.values()
-            for inner_list_element in inner_list
-        ]
-        for key, value_to_lists in nested_dict.items()
-    }
diff --git a/recipes/GigaSpeech/ASR/CTC/dataset.py b/recipes/GigaSpeech/ASR/CTC/dataset.py
new file mode 120000
index 0000000000..f3bfeaf826
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/CTC/dataset.py
@@ -0,0 +1 @@
+../../dataset.py
\ No newline at end of file
diff --git a/recipes/GigaSpeech/dataset.py b/recipes/GigaSpeech/dataset.py
index 3b6219efc8..b2f9eeadc2 100644
--- a/recipes/GigaSpeech/dataset.py
+++ b/recipes/GigaSpeech/dataset.py
@@ -144,7 +144,6 @@ class GigaspeechConfig(datasets.BuilderConfig):
     """BuilderConfig for Gigaspeech."""

     def __init__(self, name, *args, **kwargs):
-        """BuilderConfig for Gigaspeech"""
         super().__init__(name=name, *args, **kwargs)
         # larger subsets are supersets of smaller subsets,
         # if we want to download "m", we need to download "xs" and "s" data too.
From 603049c091b2b04d320c7bd640b340a2c2bd02ba Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:15:43 +0100
Subject: [PATCH 40/77] readmes

---
 recipes/GigaSpeech/ASR/CTC/README.md | 61 ++++++++++++++++++++++++----
 recipes/GigaSpeech/README.md | 13 ++++++
 2 files changed, 66 insertions(+), 8 deletions(-)
 create mode 100644 recipes/GigaSpeech/README.md

diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md
index b9d49667f5..18dcd1cd27 100644
--- a/recipes/GigaSpeech/ASR/CTC/README.md
+++ b/recipes/GigaSpeech/ASR/CTC/README.md
@@ -1,8 +1,53 @@
-to do
-
-```bash
-mkdir lm
-git clone https://huggingface.co/wgb14/gigaspeech_lm lm
-gunzip -c lm/3gram_pruned_1e7.arpa.gz > lm/3gram_pruned_1e7.arpa
-gunzip -c lm/4gram.arpa.gz > lm/4gram.arpa
-```
\ No newline at end of file
+# Speech Recognition on GigaSpeech with pre-trained self-supervised models and CTC
+
+This folder contains the scripts to finetune any HuggingFace pretrained transformer
+model (WavLM, wav2vec 2.0, HuBERT...) with CTC for speech recognition on
+GigaSpeech. Training can be done on any of the GigaSpeech subsets (XL, L, S, etc.).
+
+## Data access and download
+
+SpeechBrain supports two ways of dealing with the GigaSpeech dataset:
+1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFace, note that **you must use** the HuggingFace client to log in first before running the recipe.
+2. [Original Github](https://github.com/SpeechColab/GigaSpeech).
+
+You simply need to follow the instructions on either of the above links. **We strongly
+recommend using HuggingFace as the download speed for people outside of China is
+much quicker".
+
+## Installing Extra Dependencies
+
+Before proceeding, ensure you have installed the necessary additional dependencies.
To do this, simply run the following command in your terminal:
+
+```
+pip install -r extra_requirements.txt
+```
+
+# How to run
+
+With a single GPU:
+```
+python train_with_wavlm.py hparams/file.yaml
+```
+With multiple GPUs:
+```
+torchrun --nproc_per_node=8 train_with_wavlm.py hparams/file.yaml
+```
+
+# KenLM n-gram CTC rescoring
+To enable n-gram rescoring during the decoding, you must download (or train yourself) the n-gram language model:
+
+```
+wget https://huggingface.co/wgb14/gigaspeech_lm/resolve/main/3gram_pruned_1e7.arpa.gz
+wget https://huggingface.co/wgb14/gigaspeech_lm/resolve/main/4gram.arpa.gz
+gunzip -c 3gram_pruned_1e7.arpa.gz > 3gram_pruned_1e7.arpa
+gunzip -c 4gram.arpa.gz > 4gram.arpa
+```
+
+# Rescoring with a Neural Language Model
+This can be done by modifying the current recipe. We invite you to have a look at our LibriSpeech CTC recipe for many different examples.
+
+# Results
+
+| Release | Hyperparams file | Decoding method | Finetuning Split | Test WER | Dev WER | HuggingFace link | Full model link | Training GPUs |
+|:-------------:|:---------------------------:| :----------:| :-----:| :-----:| :-----:| :-----:| :-----:| :-----:|
+| 05-08-23 | train_hf_wavlm.yaml | GreedySearch | XL | xx | xx | TBD | TBD | 4xRTX 3090 |
\ No newline at end of file
diff --git a/recipes/GigaSpeech/README.md b/recipes/GigaSpeech/README.md
new file mode 100644
index 0000000000..c866e5a9c9
--- /dev/null
+++ b/recipes/GigaSpeech/README.md
@@ -0,0 +1,13 @@
+# Experimenting with the GigaSpeech dataset
+
+GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised and unsupervised training (this implementation contains only labelled data for now). However, data access is gated, meaning you need to request access to it.
+
+# Data access and download
+
+SpeechBrain supports two ways of dealing with the GigaSpeech dataset:
+1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFace, note that **you must use** the HuggingFace client to log in first before running the recipe.
+2. [Original Github](https://github.com/SpeechColab/GigaSpeech).
+
+You simply need to follow the instructions on either of the above links. **We strongly
+recommend using HuggingFace as the download speed for people outside of China is
+much quicker".
\ No newline at end of file
From d4b3f0d54a3af554d6e6517103884cee59f004a8 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:16:24 +0100
Subject: [PATCH 41/77] readmes

---
 recipes/GigaSpeech/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/GigaSpeech/README.md b/recipes/GigaSpeech/README.md
index c866e5a9c9..87bef08709 100644
--- a/recipes/GigaSpeech/README.md
+++ b/recipes/GigaSpeech/README.md
@@ -10,4 +10,4 @@ SpeechBrain supports two ways of dealing with the GigaSpeech dataset:

 You simply need to follow the instructions on either of the above links. **We strongly
 recommend using HuggingFace as the download speed for people outside of China is
-much quicker".
\ No newline at end of file
+much quicker**.
\ No newline at end of file
From ef87027bb32e3714d9c227feaa0e3dcbeb18ed20 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:16:36 +0100
Subject: [PATCH 42/77] readmes

---
 recipes/GigaSpeech/ASR/CTC/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md
index 18dcd1cd27..8cb7911cd2 100644
--- a/recipes/GigaSpeech/ASR/CTC/README.md
+++ b/recipes/GigaSpeech/ASR/CTC/README.md
@@ -12,7 +12,7 @@ SpeechBrain supports two ways of dealing with the GigaSpeech dataset:

 You simply need to follow the instructions on either of the above links. **We strongly
 recommend using HuggingFace as the download speed for people outside of China is
-much quicker".
+much quicker**.

 ## Installing Extra Dependencies

From 8d53430185e6f21854a66e2e53a6361a8f3fded8 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:18:53 +0100
Subject: [PATCH 43/77] readmes

---
 recipes/GigaSpeech/ASR/CTC/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md
index 8cb7911cd2..f3777b9a11 100644
--- a/recipes/GigaSpeech/ASR/CTC/README.md
+++ b/recipes/GigaSpeech/ASR/CTC/README.md
@@ -43,6 +43,8 @@ gunzip -c 3gram_pruned_1e7.arpa.gz > 3gram_pruned_1e7.arpa
 gunzip -c 4gram.arpa.gz > 4gram.arpa
 ```

+Then simply modify the *test_beam_search* entry in the yaml by adding *kenlm_model_path:* followed by your path as a parameter.
+
 # Rescoring with a Neural Language Model
 This can be done by modifying the current recipe. We invite you to have a look at our LibriSpeech CTC recipe for many different examples.

From 762a7b252b153d694ae1ac986d14682fc85bc251 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:20:56 +0100
Subject: [PATCH 44/77] doc update

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index d9ec4b6b56..e2a263a0b9 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -1,7 +1,9 @@
 """
 Data preparation script for the GigaSpeech dataset.

-Download instructions: https://github.com/SpeechColab/GigaSpeech
+Download instructions:
+    1. https://github.com/SpeechColab/GigaSpeech
+    2.
https://huggingface.co/datasets/speechcolab/gigaspeech
 Reference: https://arxiv.org/abs/2106.06909

 Author
 -------
  * Adel Moumen, 2024
 """

From 14a9df7923985f5b88dbf14d8ecf5ad6a931f341 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 14:29:10 +0100
Subject: [PATCH 45/77] CI god not happy, make CI god happy

---
 recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
index 884ee205d9..facddfabf1 100644
--- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
+++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
@@ -90,7 +90,6 @@ token_type: char # ["unigram", "bpe", "char"]
 character_coverage: 1.0

 # Model parameters
-activation: !name:torch.nn.LeakyReLU
 dnn_neurons: 1024
 dropout: 0.1
 freeze_wav2vec: False

From 19d4753c26f682e97e1fb473f6ca5c84ad2bdaaf Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 15:55:01 +0100
Subject: [PATCH 46/77] why you here little encoder

---
 recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
index facddfabf1..7b11772af7 100644
--- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
+++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml
@@ -217,8 +217,6 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
     annealing_factor: 0.9
     patient: 0

-label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder
-
 checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
     checkpoints_dir: !ref <save_folder>
     recoverables:
@@ -227,7 +225,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
         scheduler_model: !ref <lr_annealing_model>
         scheduler_wav2vec: !ref <lr_annealing_wav2vec>
         counter: !ref <epoch_counter>
-        tokenizer: !ref <label_encoder>

 train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
     save_file: !ref <train_log>

From beb2ab2b3b40c9033b1dbb2ce3fc8890f98b8628 Mon Sep 17 00:00:00 2001
From: TParcollet
Date: Tue, 8 Oct 2024 16:06:30 +0100
Subject: [PATCH 47/77] adding a transducer streaming recipe, because why not

---
 recipes/GigaSpeech/ASR/transducer/README.md | 102 ++++
 recipes/GigaSpeech/ASR/transducer/dataset.py | 1 +
 .../ASR/transducer/extra_requirements.txt | 5 +
 .../ASR/transducer/gigaspeech_prepare.py | 1 +
 .../hparams/conformer_transducer.yaml | 402 +++++++++++++
 recipes/GigaSpeech/ASR/transducer/train.py | 539 ++++++++++++++++++
 6 files changed, 1050 insertions(+)
 create mode 100644 recipes/GigaSpeech/ASR/transducer/README.md
 create mode 120000 recipes/GigaSpeech/ASR/transducer/dataset.py
 create mode 100644 recipes/GigaSpeech/ASR/transducer/extra_requirements.txt
 create mode 120000 recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py
 create mode 100644 recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml
 create mode 100644 recipes/GigaSpeech/ASR/transducer/train.py

diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md
new file mode 100644
index 0000000000..d75458cc58
--- /dev/null
+++ b/recipes/GigaSpeech/ASR/transducer/README.md
@@ -0,0 +1,102 @@
+# GigaSpeech streaming and non-streaming speech recognition with Transducer models.
+This folder contains scripts necessary to run an ASR experiment with the GigaSpeech dataset.
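For readers new to transducers, here is a minimal, self-contained sketch of the RNN-T criterion this recipe optimizes, using the torchaudio backend discussed under Extra-Dependencies below. All shapes and values are illustrative assumptions, not recipe defaults:

```python
# Minimal RNN-T loss sketch (torchaudio backend); tensor sizes are made up.
import torch
import torchaudio

B, T, U, V = 2, 50, 10, 32  # batch, encoder frames, target length, vocab size
blank = 0  # the recipe likewise uses blank_index: 0

# Joint network output: one logit per (encoder frame, target position, token).
logits = torch.randn(B, T, U + 1, V, requires_grad=True)
targets = torch.randint(1, V, (B, U), dtype=torch.int32)  # labels, no blanks
logit_lengths = torch.full((B,), T, dtype=torch.int32)
target_lengths = torch.full((B,), U, dtype=torch.int32)

loss = torchaudio.functional.rnnt_loss(
    logits, targets, logit_lengths, target_lengths, blank=blank
)
loss.backward()  # differentiable end-to-end: trains encoder, decoder and joint
print(float(loss))
```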
+Download instructions for GigaSpeech are available at
+https://github.com/SpeechColab/GigaSpeech (see also the top-level GigaSpeech README in this repository).
+
+# Extra-Dependencies
+This recipe supports two implementations of the transducer loss, see `use_torchaudio` arg in the yaml file:
+1. Transducer loss from torchaudio (this requires torchaudio version >= 0.10.0).
+2. SpeechBrain implementation using Numba. To use it, please set `use_torchaudio=False` in the yaml file. This version is implemented within SpeechBrain and allows you to directly access the Python code of the transducer loss (and directly modify it if needed).
+
+The Numba implementation is currently enabled by default as the `use_torchaudio` option is incompatible with `bfloat16` training.
+
+Note: Before running this recipe, make sure Numba is installed. Otherwise, run:
+```
+pip install numba
+```
+
+# How to run it
+```shell
+python train.py hparams/conformer_transducer.yaml
+```
+
+## Precision Notes
+If your GPU effectively supports fp16 (half-precision) computations, it is recommended to execute the training script with the `--precision=fp16` (or `--precision=bf16`) option.
+Enabling half precision can significantly reduce the peak VRAM requirements. For example, in the case of the Conformer Transducer recipe trained on LibriSpeech, the peak VRAM decreases from 39GB to 12GB when using fp16.
+According to our tests, the performance is not affected.
+
+# Results (non-streaming)

+Results are obtained with beam search and no LM (non-streaming, i.e. full context).
+
+| Language | Release | LM | Val. CER | Val. WER | Test CER | Test WER | Model link | GPUs |
+| ------------- |:-------------:| -----:| -----:| -----:| -----:| -----:| :-----------:| :-----------:|
+
+The output folders with checkpoints and logs can be found [here](https://www.dropbox.com/sh/852eq7pbt6d65ai/AACv4wAzk1pWbDo4fjVKLICYa?dl=0).
+
+## Streaming model
+
+### WER vs chunk size & left context
+
+The following matrix presents the Word Error Rate (WER%) achieved on GigaSpeech
+`test` with various chunk sizes (in ms).
+
+The relative difference is not trivial to interpret, because we are not testing
+against a continuous stream of speech, but rather against utterances of various
+lengths. This tends to bias results in favor of larger chunk sizes.
+
+The chunk size might not accurately represent expected latency due to slight
+padding differences in streaming contexts.
+
+The left context size is not representative of the receptive field of the model.
+Because the model caches the streaming context at different layers, the model
+may end up forming indirect dependencies on audio from many seconds ago.
+
+| | full | cs=32 (1280ms) | 16 (640ms) | 8 (320ms) |
+|:-----:|:----:|:-----:|:-----:|:-----:|
+| it full | 8.92 | - | - | - |
+| it lc=32 | - | 10.04 | 10.82 | 12.01 |
+| fr full | 12.47 | - | - | - |
+| fr lc=32 | - | 13.92 | 14.88 | 16.22 |
+
+### Inference
+
+Once your model is trained, you need a few manual steps in order to use it with the high-level streaming interfaces (`speechbrain.inference.ASR.StreamingASR`):
+
+1. Create a new directory where you want to store the model.
+2. Copy `results/conformer_transducer/<seed>/lm.ckpt` (optional; currently, rescoring LMs might be unsupported for streaming) and `tokenizer.ckpt` to that directory.
+3. Copy `results/conformer_transducer/<seed>/save/CKPT+????/model.ckpt` and `normalizer.ckpt` to that directory.
+4. Copy your hyperparameters file to that directory. Uncomment the streaming-specific keys and remove any training-specific keys.
Alternatively, grab the inference hyperparameters YAML for this model from HuggingFace and adapt it to any changes you may have done. +5. You can now instantiate a `StreamingASR` with your model using `StreamingASR.from_hparams("/path/to/model/")`. + +The contents of that directory may be uploaded as a HuggingFace model, in which case the model source path can just be specified as `youruser/yourmodel`. + +# **About SpeechBrain** +- Website: https://speechbrain.github.io/ +- Code: https://github.com/speechbrain/speechbrain/ +- HuggingFace: https://huggingface.co/speechbrain/ + + +# **Citing SpeechBrain** +Please, cite SpeechBrain if you use it for your research or business. + +```bibtex +@misc{speechbrainV1, + title={Open-Source Conversational AI with SpeechBrain 1.0}, + author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve}, + year={2024}, + eprint={2407.00463}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2407.00463}, +} +@misc{speechbrain, + title={{SpeechBrain}: A General-Purpose Speech Toolkit}, + author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio}, + year={2021}, + eprint={2106.04624}, + archivePrefix={arXiv}, + primaryClass={eess.AS}, + note={arXiv:2106.04624} +} +``` diff --git a/recipes/GigaSpeech/ASR/transducer/dataset.py b/recipes/GigaSpeech/ASR/transducer/dataset.py new file mode 120000 index 0000000000..f3bfeaf826 --- /dev/null +++ b/recipes/GigaSpeech/ASR/transducer/dataset.py @@ -0,0 +1 @@ +../../dataset.py \ No newline at end of file diff --git a/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt b/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt new file mode 100644 index 0000000000..47bf394ff1 --- /dev/null +++ b/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt @@ -0,0 +1,5 @@ +# Numba is used if use_torchaudio=False +# Numba might be faster, but it is harder to install +# You might need to install numba with conda +# You might also need to install other packages such as cudatoolkit +numba diff --git a/recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py b/recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py new file mode 120000 index 0000000000..5190685a8e --- /dev/null +++ b/recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py @@ -0,0 +1 @@ +../../gigaspeech_prepare.py \ No newline at end of file diff --git a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml new file mode 100644 index 0000000000..73486cae05 --- /dev/null +++ b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -0,0 +1,402 @@ +# 
############################################################################
+# Model: E2E ASR with transformer and transducer
+# Encoder: Conformer
+# Decoder: LSTM + beamsearch
+# Tokens: BPE with unigram
+# losses: Transducer + CTC (optional) + CE (optional)
+# Training: GigaSpeech
+# Authors: Titouan Parcollet 2024
+# ############################################################################
+
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 1986
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+experiment_name: conformer_transducer
+output_folder: !ref results/<experiment_name>/<seed>
+output_wer_folder: !ref <output_folder>/
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# Data files
+data_folder: !PLACEHOLDER # e.g., /path/to/GigaSpeech
+
+# see https://github.com/SpeechColab/GigaSpeech for more details on the dataset
+# must be one of ["XS", "S", "M", "L", "XL"]
+# and ["DEV", "TEST"] for the eval splits.
+splits: ["XS", "DEV", "TEST"]
+skip_prep: False
+download_with_HF: True
+convert_opus_to_wav: True
+keep_filler_words: False
+keep_punctuation: False
+ckpt_interval_minutes: 10 # save checkpoint every N min
+train_csv: !ref <save_folder>/train.csv
+valid_csv: !ref <save_folder>/dev.csv
+test_csv: !ref <save_folder>/test.csv
+json_file: !ref <data_folder>/GigaSpeech.json
+
+####################### Training Parameters ####################################
+
+# To make Transformers converge, the global batch size should be large enough.
+# The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
+# Empirically, we found that this value should be >= 128.
+# Please, set your parameters accordingly.
+number_of_epochs: 5
+warmup_steps: 25000
+num_workers: 4
+batch_size_valid: 4
+lr: 0.0008
+weight_decay: 0.01
+number_of_ctc_epochs: 1
+ctc_weight: 0.3 # Multitask with CTC for the encoder (0.0 = disabled)
+ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled)
+max_grad_norm: 5.0
+loss_reduction: 'batchmean'
+precision: fp16 # bf16, fp16 or fp32
+
+# The batch size is used if and only if dynamic batching is set to False
+# Validation and testing are done with fixed batches and not dynamic batching.
+batch_size: 8
+grad_accumulation_factor: 4
+sorting: random
+avg_checkpoints: 1 # Number of checkpoints to average for evaluation
+
+# Feature parameters
+sample_rate: 16000
+n_fft: 512
+n_mels: 80
+win_length: 32
+
+# Streaming & dynamic chunk training options
+# At least for the current architecture on LibriSpeech, we found out that
+# non-streaming accuracy is very similar between `streaming: True` and
+# `streaming: False`.
+streaming: True # controls all Dynamic Chunk Training & chunk size & left context mechanisms
+
+# Configuration for Dynamic Chunk Training.
+# In this model, a chunk is roughly equivalent to 40ms of audio.
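+# (Hedged note: the 40ms figure is per unit of chunk size. Assuming the default
+# 10 ms filterbank hop and the 2x2-strided CNN frontend defined further below,
+# one post-frontend frame covers about 4 x 10 ms = 40 ms, so chunk_size=8 maps
+# to roughly 320 ms and chunk_size=32 to roughly 1280 ms, matching the latency
+# figures quoted in the README.)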
+dynchunktrain_config_sampler: !new:speechbrain.utils.dynamic_chunk_training.DynChunkTrainConfigRandomSampler # yamllint disable-line rule:line-length
+    chunkwise_prob: 0.6 # Probability during a batch to limit attention and sample a random chunk size in the following range
+    chunk_size_min: 8 # Minimum chunk size (if in a DynChunkTrain batch)
+    chunk_size_max: 32 # Maximum chunk size (if in a DynChunkTrain batch)
+    limited_left_context_prob: 0.75 # If in a DynChunkTrain batch, the probability during a batch to restrict left context to a random number of chunks
+    left_context_chunks_min: 2 # Minimum left context size (in # of chunks)
+    left_context_chunks_max: 32 # Maximum left context size (in # of chunks)
+    # If you specify a valid/test config, you can optionally have evaluation be
+    # done with a specific DynChunkTrain configuration.
+    # valid_config: !new:speechbrain.utils.dynamic_chunk_training.DynChunkTrainConfig
+    #    chunk_size: 24
+    #    left_context_size: 16
+    # test_config: ...
+
+# Dataloader options
+train_dataloader_opts:
+    batch_size: !ref <batch_size>
+    num_workers: !ref <num_workers>
+
+valid_dataloader_opts:
+    batch_size: !ref <batch_size_valid>
+
+test_dataloader_opts:
+    batch_size: !ref <batch_size_valid>
+
+# Using dynamic batching by default. This works with 4x24GB GPUs
+# Or turn it off (but training speed will decrease)
+dynamic_batching: True
+max_batch_length_train: 150
+max_batch_length_val: 50 # we reduce it as the beam is much wider (VRAM)
+num_bucket: 200
+shuffle: True # if true re-creates batches at each epoch shuffling examples.
+batch_ordering: random
+max_batch_ex: 256
+
+dynamic_batch_sampler_train:
+    max_batch_length: !ref <max_batch_length_train>
+    num_buckets: !ref <num_bucket>
+    shuffle: !ref <shuffle>
+    batch_ordering: !ref <batch_ordering>
+    max_batch_ex: !ref <max_batch_ex>
+
+dynamic_batch_sampler_valid:
+    max_batch_length: !ref <max_batch_length_val>
+    num_buckets: !ref <num_bucket>
+    shuffle: !ref <shuffle>
+    batch_ordering: !ref <batch_ordering>
+    max_batch_ex: !ref <max_batch_ex>
+
+# BPE parameters
+token_type: unigram # ["unigram", "bpe", "char"]
+character_coverage: 1.0
+
+####################### Model Parameters #######################################
+
+# Transformer
+d_model: 768
+joint_dim: 512
+nhead: 8
+num_encoder_layers: 12
+num_decoder_layers: 0
+d_ffn: 2048
+transformer_dropout: 0.1
+activation: !name:torch.nn.GELU
+output_neurons: 1024
+dec_dim: 512
+dec_emb_dropout: 0.2
+dec_dropout: 0.1
+
+# Decoding parameters
+blank_index: 0
+bos_index: 1
+eos_index: 2
+pad_index: 0
+beam_size: 10
+nbest: 1
+# by default {state,expand}_beam = 2.3 as mentioned in the paper
+# https://arxiv.org/abs/1904.02619
+state_beam: 2.3
+expand_beam: 2.3
+lm_weight: 0.50
+
+# If True uses torchaudio loss.
Otherwise, the numba one +use_torchaudio: False + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + update_until_epoch: 4 + +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + win_length: !ref + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref + speeds: [95, 100, 105] + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + min_augmentations: 1 + max_augmentations: 1 + augment_prob: 1.0 + augmentations: [!ref ] + + +# Time Drop +time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop + drop_length_low: 12 + drop_length_high: 20 + drop_count_low: 1 + drop_count_high: 1 + replace: "zeros" + +# Frequency Drop +freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop + drop_length_low: 20 + drop_length_high: 25 + drop_count_low: 2 + drop_count_high: 2 + replace: "zeros" + dim: 2 + +# Time warp +time_warp: !new:speechbrain.augment.freq_domain.Warping + +fea_augment: !new:speechbrain.augment.augmenter.Augmenter + parallel_augment: False + concat_original: False + repeat_augment: 1 + shuffle_augmentations: False + min_augmentations: 2 + max_augmentations: 2 + augment_prob: 1.0 + augmentations: [ + !ref , + !ref , + !ref ] + +############################## Models ########################################## + +CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd + input_shape: (8, 10, 80) + num_blocks: 2 + num_layers_per_block: 1 + out_channels: (64, 32) + kernel_sizes: (3, 3) + strides: (2, 2) + residuals: (False, False) + +Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length + input_size: 640 + tgt_vocab: !ref + d_model: !ref + nhead: !ref + num_encoder_layers: !ref + num_decoder_layers: !ref + d_ffn: !ref + dropout: !ref + activation: !ref + encoder_module: conformer + attention_type: RelPosMHAXL + normalize_before: True + causal: False + +# We must call an encoder wrapper so the decoder isn't run (we don't have any) +enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper + transformer: !ref + +# For MTL CTC over the encoder +proj_ctc: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +# Define some projection layers to make sure that enc and dec +# output dim are the same before joining +proj_enc: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + bias: False + +proj_dec: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + bias: False + +# Uncomment for MTL with CTC +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + reduction: !ref + +emb: !new:speechbrain.nnet.embedding.Embedding + num_embeddings: !ref + consider_as_one_hot: True + blank_id: !ref + +dec: !new:speechbrain.nnet.RNN.LSTM + input_shape: [null, null, !ref - 1] + hidden_size: !ref + num_layers: 1 + re_init: True + +# For MTL with LM over the decoder (need to uncomment to activate) +# dec_lin: !new:speechbrain.nnet.linear.Linear +# input_size: !ref +# n_neurons: !ref +# bias: False + +# For MTL +ce_cost: !name:speechbrain.nnet.losses.nll_loss + label_smoothing: 0.1 + +Tjoint: 
!new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint + joint: sum # joint [sum | concat] + nonlinearity: !ref + +transducer_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + bias: False + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +transducer_cost: !name:speechbrain.nnet.losses.transducer_loss + blank_index: !ref + use_torchaudio: !ref + +# for MTL +# update model if any HEAD module is added +modules: + CNN: !ref + enc: !ref + emb: !ref + dec: !ref + Tjoint: !ref + transducer_lin: !ref + normalize: !ref + proj_ctc: !ref + proj_dec: !ref + proj_enc: !ref + + +# update model if any HEAD module is added +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref , !ref , !ref , !ref , !ref ] + +############################## Decoding & optimiser ############################ + +# Tokenizer initialization +tokenizer: !new:sentencepiece.SentencePieceProcessor + +Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher + decode_network_lst: [!ref , !ref , !ref ] + tjoint: !ref + classifier_network: [!ref ] + blank_id: !ref + beam_size: 1 + nbest: 1 + +Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher + decode_network_lst: [!ref , !ref , !ref ] + tjoint: !ref + classifier_network: [!ref ] + blank_id: !ref + beam_size: !ref + nbest: !ref + state_beam: !ref + expand_beam: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 1.e-8 + weight_decay: !ref + +noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +############################## Logging and Pretrainer ########################## + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + normalizer: !ref + counter: !ref + + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True + +# for the inference hparams, you will need to include and uncomment something like this: + +# make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext +# tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space + +# make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor +# decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming +# - !ref # self + +# fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper +# module: !new:speechbrain.nnet.containers.LengthsCapableSequential +# - !ref +# - !ref +# - !ref +# # don't consider normalization as part of the input filter chain. +# # normalization will operate at chunk level, which mismatches training +# # somewhat, but does not appear to result in noticeable degradation. +# properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties +# - [!ref , !ref ] diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py new file mode 100644 index 0000000000..24d7b343f8 --- /dev/null +++ b/recipes/GigaSpeech/ASR/transducer/train.py @@ -0,0 +1,539 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Transducer ASR system with GigaSpeech. 
+The system employs an encoder, a decoder, and a joint network
+between them. Decoding is performed with beamsearch.
+
+To run this recipe, do the following:
+> python train.py hparams/conformer_transducer.yaml
+
+With the default hyperparameters, the system employs a conformer encoder.
+The decoder is based on a standard LSTM. Beamsearch is applied on top of
+the decoder probabilities.
+
+The neural network is trained with the transducer loss (optionally combined
+with auxiliary CTC and CE losses), and sub-word units estimated with
+Byte-Pair Encoding (BPE) are used as basic recognition tokens.
+
+The experiment file is flexible enough to support a large variety of
+different systems. By properly changing the parameter files, you can try
+different encoders, decoders, tokens (e.g., characters instead of BPE),
+training split, and many
+other possible variations.
+
+
+Authors
+ * Sylvain de Langen 2024
+ * Titouan Parcollet 2024
+ * Abdel Heba 2020
+ * Mirco Ravanelli 2020
+ * Ju-Chieh Chou 2020
+ * Peter Plantinga 2020
+"""
+
+import os
+import sys
+
+import torch
+from hyperpyyaml import load_hyperpyyaml
+
+import speechbrain as sb
+from speechbrain.tokenizers.SentencePiece import SentencePiece
+from speechbrain.utils.data_utils import undo_padding
+from speechbrain.utils.distributed import if_main_process, run_on_main
+from speechbrain.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Define training procedure
+
+
+class ASR(sb.Brain):
+    def compute_forward(self, batch, stage):
+        """Forward computations from the waveform batches to the output probabilities."""
+        batch = batch.to(self.device)
+        wavs, wav_lens = batch.sig
+        tokens_with_bos, token_with_bos_lens = batch.tokens_bos
+
+        # Add waveform augmentation if specified.
+        if stage == sb.Stage.TRAIN:
+            if hasattr(self.hparams, "wav_augment"):
+                wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)
+                tokens_with_bos = self.hparams.wav_augment.replicate_labels(
+                    tokens_with_bos
+                )
+
+        feats = self.hparams.compute_features(wavs)
+
+        # Add feature augmentation if specified.
+ if stage == sb.Stage.TRAIN and hasattr(self.hparams, "fea_augment"): + feats, fea_lens = self.hparams.fea_augment(feats, wav_lens) + tokens_with_bos = self.hparams.fea_augment.replicate_labels( + tokens_with_bos + ) + + current_epoch = self.hparams.epoch_counter.current + + # Old models may not have the streaming hparam, we don't break them in + # any other way so just check for its presence + if hasattr(self.hparams, "streaming") and self.hparams.streaming: + dynchunktrain_config = self.hparams.dynchunktrain_config_sampler( + stage + ) + else: + dynchunktrain_config = None + + feats = self.modules.normalize(feats, wav_lens, epoch=current_epoch) + + src = self.modules.CNN(feats) + x = self.modules.enc( + src, + wav_lens, + pad_idx=self.hparams.pad_index, + dynchunktrain_config=dynchunktrain_config, + ) + x = self.modules.proj_enc(x) + + e_in = self.modules.emb(tokens_with_bos) + e_in = torch.nn.functional.dropout( + e_in, + self.hparams.dec_emb_dropout, + training=(stage == sb.Stage.TRAIN), + ) + h, _ = self.modules.dec(e_in) + h = torch.nn.functional.dropout( + h, self.hparams.dec_dropout, training=(stage == sb.Stage.TRAIN) + ) + h = self.modules.proj_dec(h) + + # Joint network + # add labelseq_dim to the encoder tensor: [B,T,H_enc] => [B,T,1,H_enc] + # add timeseq_dim to the decoder tensor: [B,U,H_dec] => [B,1,U,H_dec] + joint = self.modules.Tjoint(x.unsqueeze(2), h.unsqueeze(1)) + + # Output layer for transducer log-probabilities + logits_transducer = self.modules.transducer_lin(joint) + + # Compute outputs + if stage == sb.Stage.TRAIN: + p_ctc = None + p_ce = None + + if ( + self.hparams.ctc_weight > 0.0 + and current_epoch <= self.hparams.number_of_ctc_epochs + ): + # Output layer for ctc log-probabilities + out_ctc = self.modules.proj_ctc(x) + p_ctc = self.hparams.log_softmax(out_ctc) + + if self.hparams.ce_weight > 0.0: + # Output layer for ctc log-probabilities + p_ce = self.modules.dec_lin(h) + p_ce = self.hparams.log_softmax(p_ce) + + return p_ctc, p_ce, logits_transducer, wav_lens + + elif stage == sb.Stage.VALID: + best_hyps, scores, _, _ = self.hparams.Greedysearcher(x) + return logits_transducer, wav_lens, best_hyps + else: + ( + best_hyps, + best_scores, + nbest_hyps, + nbest_scores, + ) = self.hparams.Beamsearcher(x) + return logits_transducer, wav_lens, best_hyps + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (Transducer+(CTC+NLL)) given predictions and targets.""" + + ids = batch.id + tokens, token_lens = batch.tokens + tokens_eos, token_eos_lens = batch.tokens_eos + + # Train returns 4 elements vs 3 for val and test + if len(predictions) == 4: + p_ctc, p_ce, logits_transducer, wav_lens = predictions + else: + logits_transducer, wav_lens, predicted_tokens = predictions + + if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) + if hasattr(self.hparams, "fea_augment"): + ( + tokens, + token_lens, + tokens_eos, + token_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, token_lens, tokens_eos, token_eos_lens + ) + + if stage == sb.Stage.TRAIN: + CTC_loss = 0.0 + CE_loss = 0.0 + if p_ctc is not None: + CTC_loss = self.hparams.ctc_cost( + p_ctc, tokens, wav_lens, token_lens + ) + if p_ce is not None: + CE_loss = self.hparams.ce_cost( + p_ce, tokens_eos, length=token_eos_lens + ) + loss_transducer = self.hparams.transducer_cost( + logits_transducer, tokens, wav_lens, token_lens + ) + loss = ( + 
self.hparams.ctc_weight * CTC_loss + + self.hparams.ce_weight * CE_loss + + (1 - (self.hparams.ctc_weight + self.hparams.ce_weight)) + * loss_transducer + ) + else: + loss = self.hparams.transducer_cost( + logits_transducer, tokens, wav_lens, token_lens + ) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + # Convert indices to words + target_words = undo_padding(tokens, token_lens) + target_words = self.tokenizer(target_words, task="decode_from_list") + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + """At the end of the optimizer step, apply noam annealing.""" + if should_step: + self.hparams.noam_annealing(self.optimizer) + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.error_rate_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + lr = self.hparams.noam_annealing.current_lr + steps = self.optimizer_step + optimizer = self.optimizer.__class__.__name__ + + epoch_stats = { + "epoch": epoch, + "lr": lr, + "steps": steps, + "optimizer": optimizer, + } + + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.test_wer_file, "w") as w: + self.wer_metric.write_stats(w) + + # save the averaged checkpoint at the end of the evaluation stage + # delete the rest of the intermediate checkpoints + # WER is set to -0.1 so checkpointer only keeps the averaged checkpoint + self.checkpointer.save_and_keep_only( + meta={"WER": -0.1, "epoch": epoch}, + min_keys=["WER"], + num_to_keep=1, + ) + + def on_evaluate_start(self, max_key=None, min_key=None): + """perform checkpoint average if needed""" + super().on_evaluate_start() + + ckpts = self.checkpointer.find_checkpoints( + max_key=max_key, + min_key=min_key, + ) + ckpt = sb.utils.checkpoints.average_checkpoints( + ckpts, recoverable_name="model" + ) + + self.hparams.model.load_state_dict(ckpt, strict=True) + self.hparams.model.eval() + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. 
+ """ + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], + replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], + replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + test_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["test_csv"], + replacements={"data_root": data_folder}, + ) + + # We also sort the validation data so it is faster to validate + test_data = test_data.filtered_sorted(sort_key="duration") + + datasets = [train_data, valid_data, test_data] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("audio_path", "begin_time", "end_time") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(audio_path, begin_time, end_time): + if hparams["download_with_HF"]: + sig = sb.dataio.dataio.read_audio(audio_path) + else: + start_sample = int(float(begin_time) * hparams["sample_rate"]) + stop_sample = int(float(end_time) * hparams["sample_rate"]) + sig = sb.dataio.dataio.read_audio( + {"file": audio_path, "start": start_sample, "stop": stop_sample} + ) + return sig + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("text") + @sb.utils.data_pipeline.provides( + "wrd", "tokens_list", "tokens_bos", "tokens_eos", "tokens" + ) + def text_pipeline(wrd): + yield wrd + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list)) + yield tokens_bos + tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]]) + yield tokens_eos + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, + ["id", "sig", "wrd", "tokens_bos", "tokens_eos", "tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
+    train_batch_sampler = None
+    valid_batch_sampler = None
+    if hparams["dynamic_batching"]:
+        from speechbrain.dataio.sampler import DynamicBatchSampler  # noqa
+
+        dynamic_hparams_train = hparams["dynamic_batch_sampler_train"]
+        dynamic_hparams_valid = hparams["dynamic_batch_sampler_valid"]
+
+        train_batch_sampler = DynamicBatchSampler(
+            train_data,
+            length_func=lambda x: x["duration"],
+            **dynamic_hparams_train,
+        )
+        valid_batch_sampler = DynamicBatchSampler(
+            valid_data,
+            length_func=lambda x: x["duration"],
+            **dynamic_hparams_valid,
+        )
+
+    return (
+        train_data,
+        valid_data,
+        test_data,
+        train_batch_sampler,
+        valid_batch_sampler,
+    )
+
+
+if __name__ == "__main__":
+    # CLI:
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    # Use torchaudio on CPU, as the Numba transducer loss needs a CUDA device
+    if run_opts.get("device") == "cpu":
+        if "use_torchaudio: False" in overrides:
+            overrides = overrides.replace(
+                "use_torchaudio: False", "use_torchaudio: True"
+            )
+        else:
+            overrides += "\nuse_torchaudio: True"
+
+    # create ddp_group with the right communication protocol
+    sb.utils.distributed.ddp_init_group(run_opts)
+
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Dataset prep (parsing GigaSpeech)
+    from gigaspeech_prepare import prepare_gigaspeech  # noqa
+
+    # multi-gpu (ddp) save data preparation
+    run_on_main(
+        prepare_gigaspeech,
+        kwargs={
+            "data_folder": hparams["data_folder"],
+            "save_folder": hparams["save_folder"],
+            "splits": hparams["splits"],
+            "output_train": hparams["train_csv"],
+            "output_dev": hparams["valid_csv"],
+            "output_test": hparams["test_csv"],
+            "json_file": hparams["json_file"],
+            "skip_prep": hparams["skip_prep"],
+            "convert_opus_to_wav": hparams["convert_opus_to_wav"],
+            "download_with_HF": hparams["download_with_HF"],
+            "punctuation": hparams["keep_punctuation"],
+            "filler": hparams["keep_filler_words"],
+        },
+    )
+
+    # Defining tokenizer and loading it
+    tokenizer = SentencePiece(
+        model_dir=hparams["save_folder"],
+        vocab_size=hparams["output_neurons"],
+        annotation_train=hparams["train_csv"],
+        annotation_read="text",
+        model_type=hparams["token_type"],
+        character_coverage=hparams["character_coverage"],
+        bos_id=hparams["bos_index"],
+        eos_id=hparams["eos_index"],
+    )
+
+    # here we create the datasets objects as well as tokenization and encoding
+    (
+        train_data,
+        valid_data,
+        test_data,
+        train_bsampler,
+        valid_bsampler,
+    ) = dataio_prepare(hparams)
+
+    # Trainer initialization
+    asr_brain = ASR(
+        modules=hparams["modules"],
+        opt_class=hparams["opt_class"],
+        hparams=hparams,
+        run_opts=run_opts,
+        checkpointer=hparams["checkpointer"],
+    )
+
+    # We dynamically add the tokenizer to our brain class.
+    asr_brain.tokenizer = tokenizer
+
+    # Pick the dataloader options, taking any dynamic batch samplers into account.
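+    # (Hedged note: PyTorch's DataLoader does not accept batch_size or shuffle
+    # together with a batch_sampler, hence the option dicts are rebuilt below
+    # with only the sampler and the number of workers.)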
+ train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, + ) + + # Testing + os.makedirs(hparams["output_wer_folder"], exist_ok=True) + + # report WER on valid data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], "valid_wer.txt" + ) + asr_brain.evaluate( + valid_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) + + # report WER on test data + asr_brain.hparams.test_wer_file = os.path.join( + hparams["output_wer_folder"], "test_wer.txt" + ) + asr_brain.evaluate( + test_data, + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) From cde564ae8308afa2707bd4356ca13c7f9a709b93 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 16:09:11 +0100 Subject: [PATCH 48/77] add test for transducer --- tests/recipes/GigaSpeech.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/recipes/GigaSpeech.csv b/tests/recipes/GigaSpeech.csv index a60e84f864..afe2dfa567 100644 --- a/tests/recipes/GigaSpeech.csv +++ b/tests/recipes/GigaSpeech.csv @@ -1,2 +1,3 @@ Task,Dataset,Script_file,Hparam_file,Data_prep_file,Readme_file,Result_url,HF_repo,test_debug_flags,test_debug_checks,performance -ASR-CTC,GigaSpeech,recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py,recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml,recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True --wav2vec2_folder=tests/tmp/wav2vec2_checkpoint, \ No newline at end of file +ASR-CTC,GigaSpeech,recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py,recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml,recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True --wav2vec2_folder=tests/tmp/wav2vec2_checkpoint, +ASR-Transducers,GigaSpeech,recipes/GigaSpeech/ASR/transducer/train.py,recipes/GigaSpeech/ASR/transducer/hparams/conformer_large.yaml,recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/transducer/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True, \ No newline at end of file From 7f1ff0ec3fabd4b53f4ef0212f3272ad3069c991 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 16:13:50 +0100 Subject: [PATCH 49/77] works better when me not stupid --- tests/recipes/GigaSpeech.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recipes/GigaSpeech.csv b/tests/recipes/GigaSpeech.csv index afe2dfa567..ddf43f3cc5 100644 --- 
a/tests/recipes/GigaSpeech.csv +++ b/tests/recipes/GigaSpeech.csv @@ -1,3 +1,3 @@ Task,Dataset,Script_file,Hparam_file,Data_prep_file,Readme_file,Result_url,HF_repo,test_debug_flags,test_debug_checks,performance ASR-CTC,GigaSpeech,recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py,recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml,recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True --wav2vec2_folder=tests/tmp/wav2vec2_checkpoint, -ASR-Transducers,GigaSpeech,recipes/GigaSpeech/ASR/transducer/train.py,recipes/GigaSpeech/ASR/transducer/hparams/conformer_large.yaml,recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/transducer/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True, \ No newline at end of file +ASR-Transducers,GigaSpeech,recipes/GigaSpeech/ASR/transducer/train.py,recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml,recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/transducer/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True, \ No newline at end of file From d27e285cbd991e6bb415ac9c5fc551a16453ec0c Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 16:20:15 +0100 Subject: [PATCH 50/77] fix yaml --- .../ASR/transducer/hparams/conformer_transducer.yaml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml index 73486cae05..bfbe7a47ba 100644 --- a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml +++ b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # see https://github.com/SpeechColab/GigaSpeech for more details on the dataset # must be one of ["XS", "S", "M", "L", "XL"] # and ["DEV", "TEST"] for the eval splits. -splits: ["XS", "DEV", "TEST"] +splits: ["M", "DEV", "TEST"] skip_prep: False download_with_HF: True convert_opus_to_wav: True @@ -41,7 +41,7 @@ json_file: !ref /GigaSpeech.json # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. # Please, set your parameters accordingly. -number_of_epochs: 5 +number_of_epochs: 100 warmup_steps: 25000 num_workers: 4 batch_size_valid: 4 @@ -53,11 +53,12 @@ ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled) max_grad_norm: 5.0 loss_reduction: 'batchmean' precision: fp16 # bf16, fp16 or fp32 +grad_accumulation_factor: 1 # The batch size is used if and only if dynamic batching is set to False # Validation and testing are done with fixed batches and not dynamic batching. batch_size: 8 -grad_accumulation_factor: 4 + sorting: random avg_checkpoints: 1 # Number of checkpoints to average for evaluation @@ -102,6 +103,7 @@ test_dataloader_opts: # Using dynamic batching by default. 
This works with 4x24GB GPUs # Or turn it off (but training speed will decrease) +# Play with grad_accum_factor such that the total batch is around 600 to 1500 s. dynamic_batching: True max_batch_length_train: 150 max_batch_length_val: 50 # we reduce it as the beam is much wider (VRAM) @@ -155,7 +157,6 @@ nbest: 1 # https://arxiv.org/abs/1904.02619 state_beam: 2.3 expand_beam: 2.3 -lm_weight: 0.50 # If True uses torchaudio loss. Otherwise, the numba one use_torchaudio: False @@ -331,9 +332,6 @@ model: !new:torch.nn.ModuleList ############################## Decoding & optimiser ############################ -# Tokenizer initialization -tokenizer: !new:sentencepiece.SentencePieceProcessor - Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher decode_network_lst: [!ref , !ref , !ref ] tjoint: !ref From 800d637b4fc53c1868d214d802aea71bd44b0c9d Mon Sep 17 00:00:00 2001 From: TParcollet Date: Tue, 8 Oct 2024 16:22:38 +0100 Subject: [PATCH 51/77] update req --- recipes/GigaSpeech/ASR/transducer/extra_requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt b/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt index 47bf394ff1..f582033930 100644 --- a/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt +++ b/recipes/GigaSpeech/ASR/transducer/extra_requirements.txt @@ -1,5 +1,8 @@ +datasets # Numba is used if use_torchaudio=False # Numba might be faster, but it is harder to install # You might need to install numba with conda # You might also need to install other packages such as cudatoolkit numba +soundfile +speechcolab From b76911b776773ea4697db4ebcd4dbfb6d5a85281 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Wed, 9 Oct 2024 11:49:53 +0100 Subject: [PATCH 52/77] add warning for cache dir --- recipes/GigaSpeech/gigaspeech_prepare.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index e2a263a0b9..0e0144a7b6 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -136,6 +136,8 @@ def prepare_gigaspeech( be faster and more reliable than the official host. Make sure to read the instructions on how to get the dataset from Hugging Face here: https://huggingface.co/datasets/speechcolab/gigaspeech + The dataset will be downloaded in the default folder specified in the + environment variable HF_HUB_CACHE. Please change it if necessary. punctuation: bool, optional Keeping the punctuation, or not. filler: bool, optional @@ -195,11 +197,18 @@ def prepare_gigaspeech( "HuggingFace dataset.py not found. Please run this recipe from the correct recipe folder or copy the dataset.py file." ) + hf_caching_dir = os.environ["HF_HUB_CACHE"] + logger.info( + "Downloading dataset from HuggingFace to: " + str(hf_caching_dir) + ) + logger.info( + "To change this directory modify the HF_HUB_CACHE env. variable." 
+ ) + hf_dataset = load_dataset( "dataset.py", train_split.lower(), trust_remote_code=True, - cache_dir=data_folder, data_dir=data_folder, ) for split, output in save_output.items(): From f1be37b5481fb5d317184e80e95845dd7d4bc207 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Wed, 9 Oct 2024 12:02:48 +0100 Subject: [PATCH 53/77] add warning for cache dir --- recipes/GigaSpeech/gigaspeech_prepare.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 0e0144a7b6..496f907a64 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -197,7 +197,11 @@ def prepare_gigaspeech( "HuggingFace dataset.py not found. Please run this recipe from the correct recipe folder or copy the dataset.py file." ) - hf_caching_dir = os.environ["HF_HUB_CACHE"] + if "HF_HUB_CACHE" in os.environ: + hf_caching_dir = os.environ["HF_HUB_CACHE"] + else: + hf_caching_dir = os.environ["HF_HOME"] + logger.info( "Downloading dataset from HuggingFace to: " + str(hf_caching_dir) ) From d96d2ceabc27ca9fc9a2bfd28979107f7ec89f82 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Wed, 9 Oct 2024 14:58:07 +0100 Subject: [PATCH 54/77] enable multiprocessing --- recipes/GigaSpeech/gigaspeech_prepare.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 496f907a64..e7b25abf11 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -98,6 +98,7 @@ def prepare_gigaspeech( download_with_HF: bool = False, punctuation: bool = False, filler: bool = False, + hf_multiprocess_load: bool = True, ) -> None: """Prepare the csv files for GigaSpeech dataset. @@ -138,10 +139,14 @@ def prepare_gigaspeech( https://huggingface.co/datasets/speechcolab/gigaspeech The dataset will be downloaded in the default folder specified in the environment variable HF_HUB_CACHE. Please change it if necessary. - punctuation: bool, optional + punctuation : bool, optional Keeping the punctuation, or not. - filler: bool, optional + filler : bool, optional Keeping filler words (hum), or not. + hf_multiprocess_load: bool, optional + If True, all the CPU threads will be used for data preparation. If set to + False, only one will be used. Note that the data preparation of the larger + sets on a single core can take more than 24 hours (from downloading to done). Returns ------- @@ -209,11 +214,18 @@ def prepare_gigaspeech( "To change this directory modify the HF_HUB_CACHE env. variable."
) + nproc = 1 + if hf_multiprocess_load: + import multiprocessing + + nproc = multiprocessing.cpu_count() + hf_dataset = load_dataset( "dataset.py", train_split.lower(), trust_remote_code=True, data_dir=data_folder, + num_proc=nproc, ) for split, output in save_output.items(): logger.info(f"Starting creating {output} using {split} split.") From 5259f279c164631ae7679951eb2e65b47481d086 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Thu, 10 Oct 2024 14:29:11 +0100 Subject: [PATCH 55/77] allow data prep without ddp --- .../GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 7 ++++--- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index 7b11772af7..bc44e707a7 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -36,7 +36,8 @@ test_csv: !ref /test.csv json_file: !ref /GigaSpeech.json # Training parameters -number_of_epochs: 5 +number_of_epochs: 3 +optimizer_step_limit: 400000 lr: 0.9 lr_wav2vec: 0.0001 sorting: ascending @@ -64,7 +65,7 @@ test_dataloader_opts: # Using dynamic batching by default. This works with 4x24GB GPUs # Or turn it off (but training speed will decrease) dynamic_batching: True -max_batch_length_train: 60 +max_batch_length_train: 50 max_batch_length_val: 30 # we reduce it as the beam is much wider (VRAM) num_bucket: 200 shuffle: True # if true re-creates batches at each epoch shuffling examples. @@ -208,7 +209,7 @@ wav2vec_opt_class: !name:torch.optim.AdamW lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 - annealing_factor: 0.8 + annealing_factor: 0.75 patient: 0 lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 5d1d6cbd9f..5cd9696ea9 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -312,9 +312,6 @@ def text_pipeline(wrd): # CLI: hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - with open(hparams_file) as fin: hparams = load_hyperpyyaml(fin, overrides) @@ -328,7 +325,9 @@ def text_pipeline(wrd): # Dataset prep (parsing Librispeech) from gigaspeech_prepare import prepare_gigaspeech # noqa - # multi-gpu (ddp) save data preparation + # We run on main for no reason as it is advised to not run this dataprep with + # DDP initialised. Indeed, it takes a lot of time and will most likely + # result in a timeout (internal DDP timeout). 
run_on_main( prepare_gigaspeech, kwargs={ @@ -339,7 +338,6 @@ def text_pipeline(wrd): "output_dev": hparams["valid_csv"], "output_test": hparams["test_csv"], "json_file": hparams["json_file"], - "skip_prep": hparams["skip_prep"], "convert_opus_to_wav": hparams["convert_opus_to_wav"], "download_with_HF": hparams["download_with_HF"], "punctuation": hparams["keep_punctuation"], @@ -347,6 +345,9 @@ def text_pipeline(wrd): }, ) + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + # Defining tokenizer and loading it tokenizer = SentencePiece( model_dir=hparams["save_folder"], From c0ea27af7f1986231218a7b6a6123fa5144d508a Mon Sep 17 00:00:00 2001 From: TParcollet Date: Thu, 10 Oct 2024 16:22:18 +0100 Subject: [PATCH 56/77] fix tests --- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 5cd9696ea9..ab9e792294 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -341,6 +341,7 @@ def text_pipeline(wrd): "convert_opus_to_wav": hparams["convert_opus_to_wav"], "download_with_HF": hparams["download_with_HF"], "punctuation": hparams["keep_punctuation"], + "skip_prep": hparams["skip_prep"], "filler": hparams["keep_filler_words"], }, ) From 688cbe3f86a442cd89b5dcc0e40e1ca80f599dcd Mon Sep 17 00:00:00 2001 From: TParcollet Date: Thu, 10 Oct 2024 16:25:50 +0100 Subject: [PATCH 57/77] smoll readme update --- recipes/GigaSpeech/ASR/transducer/README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md index d75458cc58..193fced3a7 100644 --- a/recipes/GigaSpeech/ASR/transducer/README.md +++ b/recipes/GigaSpeech/ASR/transducer/README.md @@ -1,7 +1,6 @@ # GigaSpeech streaming and non streaming speech recognition with Transducer models. This folder contains scripts necessary to run an ASR experiment with the GigaSpeech dataset. Before running this recipe, make sure numba is installed (pip install numba) -You can download LibriSpeech at http://www.openslr.org/12 # Extra-Dependencies This recipe supports two implementations of the transducer loss, see `use_torchaudio` arg in the yaml file: @@ -22,7 +21,7 @@ python train.py hparams/conformer_transducer.yaml ## Precision Notes If your GPU effectively supports fp16 (half-precision) computations, it is recommended to execute the training script with the `--precision=fp16` (or `--precision=bf16`) option. -Enabling half precision can significantly reduce the peak VRAM requirements. For example, in the case of the Conformer Transducer recipe trained with Librispeech, the peak VRAM decreases from 39GB to 12GB when using fp16. +Enabling half precision can significantly reduce the peak VRAM requirements. For example, in the case of the Conformer Transducer recipe trained with GigaSpeech, the peak VRAM decreases from 39GB to 12GB when using fp16. According to our tests, the performance is not affected. # Results (non-streaming) @@ -54,10 +53,6 @@ may end up forming indirect dependencies to audio many seconds ago. 
| | full | cs=32 (1280ms) | 16 (640ms) | 8 (320ms) | |:-----:|:----:|:-----:|:-----:|:-----:| -| it full | 8.92 | - | - | - | -| it lc=32 | - | 10.04 | 10.82 | 12.01 | -| fr full | 12.47 | - | - | - | -| fr lc=32 | - | 13.92 | 14.88 | 16.22 | ### Inference From 99d998ef4ae7bb3bafc783dc16e4c03bb9ae04ae Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 10:54:58 +0100 Subject: [PATCH 58/77] fix review comments --- .../GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 2 +- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 8 +++++++- .../ASR/transducer/hparams/conformer_transducer.yaml | 9 ++++----- recipes/GigaSpeech/ASR/transducer/train.py | 2 -- recipes/GigaSpeech/README.md | 2 +- recipes/GigaSpeech/gigaspeech_prepare.py | 11 +++++------ 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index bc44e707a7..db7cee2e50 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -7,7 +7,7 @@ # Seed needs to be set at top of yaml, before objects with parameters are made seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] +__set_seed: !apply:speechbrain.utils.seed_everything [!ref ] experiment_name: train_wavlm_char output_folder: !ref results// output_wer_folder: !ref / diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index ab9e792294..7257535f37 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -1,7 +1,13 @@ -"""TODO +""" This recipe finetunes a pretrained wavlm model large +on GigaSpeech for speech recognition with CTC and at the character level. +The WavLM model can be swapped with any HuggingFace model if wanted. + +To run this recipe, do the following: +> python train_with_wavlm.py hparams/train_hf_wavlm.yaml Authors * Adel Moumen 2024 + * Titouan Parcollet 2024 """ import logging diff --git a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml index bfbe7a47ba..35ac3944b9 100644 --- a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml +++ b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -10,7 +10,7 @@ # Seed needs to be set at top of yaml, before objects with parameters are made seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] +__set_seed: !apply:speechbrain.utils.seed_everything [!ref ] experiment_name: conformer_transducer output_folder: !ref results// output_wer_folder: !ref / @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # see https://github.com/SpeechColab/GigaSpeech for more details on the dataset # must be one of ["XS", "S", "M", "L", "XL"] # and ["DEV", "TEST"] for the eval splits. -splits: ["M", "DEV", "TEST"] +splits: ["XL", "DEV", "TEST"] skip_prep: False download_with_HF: True convert_opus_to_wav: True @@ -41,13 +41,12 @@ json_file: !ref /GigaSpeech.json # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. # Please, set your parameters accordingly. 
-number_of_epochs: 100 -warmup_steps: 25000 +number_of_epochs: 5 num_workers: 4 batch_size_valid: 4 lr: 0.0008 weight_decay: 0.01 -number_of_ctc_epochs: 1 +number_of_ctc_epochs: 2 ctc_weight: 0.3 # Multitask with CTC for the encoder (0.0 = disabled) ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled) max_grad_norm: 5.0 diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py index 24d7b343f8..309f45bde0 100644 --- a/recipes/GigaSpeech/ASR/transducer/train.py +++ b/recipes/GigaSpeech/ASR/transducer/train.py @@ -45,8 +45,6 @@ logger = get_logger(__name__) -# Define training procedure - class ASR(sb.Brain): def compute_forward(self, batch, stage): diff --git a/recipes/GigaSpeech/README.md b/recipes/GigaSpeech/README.md index 87bef08709..a71fd9593d 100644 --- a/recipes/GigaSpeech/README.md +++ b/recipes/GigaSpeech/README.md @@ -5,7 +5,7 @@ GigaSpeech is an evolving, multi-domain English speech recognition corpus with 1 # Data access and download SpeechBrain supports two ways of dealing with the GigaSpeech dataset: -1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFacem note that **you must use** the HuggingFace client to log in first before running the recipe. +1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFace note that **you must use** the HuggingFace client to log in first before running the recipe. 2. [Original Github](https://github.com/SpeechColab/GigaSpeech). You simply need to follow the instructions on either of the above links. **We strongly diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index e7b25abf11..6da5abfbc8 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -18,7 +18,6 @@ import os from dataclasses import dataclass -import speechbrain as sb from speechbrain.utils.parallel import parallel_map logger = logging.getLogger(__name__) @@ -509,11 +508,11 @@ def HF_process_line(row: dict, punctuation: bool, filler: bool) -> list: assert os.path.isfile(audio_path), f"File not found: {audio_path}" # check reading the audio file ; HF may have some corrupted files - try: - _ = sb.dataio.dataio.read_audio(audio_path) - except Exception as e: - logger.error(f"Failed reading {audio_path}: {e}") - return None + # try: + # _ = sb.dataio.dataio.read_audio(audio_path) + # except Exception as e: + # logger.error(f"Failed reading {audio_path}: {e}") + # return None text = preprocess_text(row["text"], punctuation, filler) if text: From 679e2707ae581331497b54b6af9382b9cfca4c40 Mon Sep 17 00:00:00 2001 From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics Date: Fri, 11 Oct 2024 16:51:36 +0100 Subject: [PATCH 59/77] wtf --- recipes/GigaSpeech/ASR/CTC/README.md | 8 ++++++++ recipes/GigaSpeech/ASR/transducer/README.md | 18 ++++++++++++++++++ .../hparams/conformer_transducer.yaml | 8 +++++--- recipes/GigaSpeech/gigaspeech_prepare.py | 4 +++- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md index f3777b9a11..9e3f7cc69f 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -14,6 +14,14 @@ You simply need to follow the instructions on either of the above links. **We st recomment using HuggingFace as the download speed for people outside of China is much quicker**. 
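+Logging in is typically done with the Hugging Face CLI. As a rough sketch (the exact commands may vary with your `huggingface_hub` version, and you must first accept the dataset terms on the GigaSpeech dataset page):
+
+```
+pip install huggingface_hub
+huggingface-cli login  # paste an access token from your HF account settings
+```
+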
+## Data preparation + +**This step can be very long (24h+) for the XL split of GigaSpeech** + +SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with datasets, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: + +```export HF_HUB_CACHE=/path/to/your/data/folder``` + ## Installing Extra Dependencies Before proceeding, ensure you have installed the necessary additional dependencies. To do this, simply run the following command in your terminal: diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md index 193fced3a7..534b79036e 100644 --- a/recipes/GigaSpeech/ASR/transducer/README.md +++ b/recipes/GigaSpeech/ASR/transducer/README.md @@ -2,6 +2,24 @@ This folder contains scripts necessary to run an ASR experiment with the GigaSpeech dataset. Before running this recipe, make sure numba is installed (pip install numba) +## Data access and download + +SpeechBrain supports two ways of dealing with the GigaSpeech dataset: +1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFace, note that **you must use** the HuggingFace client to log in first before running the recipe. +2. [Original Github](https://github.com/SpeechColab/GigaSpeech). + +You simply need to follow the instructions on either of the above links. **We strongly +recommend using HuggingFace as the download speed for people outside of China is +much quicker**. + +## Data preparation + +**This step can be very long (24h+) for the XL split of GigaSpeech** + +SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with datasets, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: + +```export HF_HUB_CACHE=/path/to/your/data/folder``` + # Extra-Dependencies This recipe supports two implementations of the transducer loss, see `use_torchaudio` arg in the yaml file: 1. Transducer loss from torchaudio (this requires torchaudio version >= 0.10.0). diff --git a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml index 35ac3944b9..43f6307d02 100644 --- a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml +++ b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -41,7 +41,9 @@ json_file: !ref /GigaSpeech.json # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. # Please, set your parameters accordingly.
-number_of_epochs: 5 +number_of_epochs: 10 +optimizer_step_limit: 400000 +warmup_steps: 30000 num_workers: 4 batch_size_valid: 4 lr: 0.0008 weight_decay: 0.01 @@ -100,11 +102,11 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref -# Using dynamic batching by default. This works with 4x24GB GPUs +# Using dynamic batching by default. This works with 48GB GPUs # Or turn it off (but training speed will decrease) # Play with grad_accum_factor such that the total batch is around 600 to 1500 s. dynamic_batching: True -max_batch_length_train: 150 +max_batch_length_train: 250 max_batch_length_val: 50 # we reduce it as the beam is much wider (VRAM) num_bucket: 200 shuffle: True # if true re-creates batches at each epoch shuffling examples. diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 6da5abfbc8..8c3ec631f6 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -203,8 +203,10 @@ def prepare_gigaspeech( if "HF_HUB_CACHE" in os.environ: hf_caching_dir = os.environ["HF_HUB_CACHE"] - else: + elif "HF_HOME" in os.environ: hf_caching_dir = os.environ["HF_HOME"] + else: + hf_caching_dir = os.environ["XDG_CACHE_HOME"] logger.info( "Downloading dataset from HuggingFace to: " + str(hf_caching_dir) ) From a33cd7be7042297f0f3f880649e1b9f79f994fe8 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 17:04:00 +0100 Subject: [PATCH 60/77] update doc --- recipes/GigaSpeech/ASR/CTC/README.md | 2 +- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 10 +++++++--- recipes/GigaSpeech/ASR/transducer/README.md | 2 +- .../ASR/transducer/hparams/conformer_transducer.yaml | 1 + recipes/GigaSpeech/ASR/transducer/train.py | 3 +++ 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md index 9e3f7cc69f..dbd4c20b79 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -16,7 +16,7 @@ much quicker**. ## Data preparation -**This step can be very long (24h+) for the XL split of GigaSpeech** +**This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU), the recipe must be run once without DDP, otherwise it will time out. You do not want to leave X GPUs sitting idle for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with datasets, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 99aa2588ef..3e5589dee6 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -2,7 +2,8 @@ on GigaSpeech for speech recognition with CTC and at the character level. The WavLM model can be swapped with any HuggingFace model if wanted.
To run this recipe, do the following: > python train_with_wavlm.py hparams/train_hf_wavlm.yaml @@ -321,6 +322,9 @@ def text_pipeline(wrd): with open(hparams_file) as fin: hparams = load_hyperpyyaml(fin, overrides) + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -352,8 +356,8 @@ def text_pipeline(wrd): }, ) - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) + if hparams["data_prep_only"]: + exit # Defining tokenizer and loading it tokenizer = SentencePiece( diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md index 534b79036e..d3c00cd101 100644 --- a/recipes/GigaSpeech/ASR/transducer/README.md +++ b/recipes/GigaSpeech/ASR/transducer/README.md @@ -14,7 +14,7 @@ much quicker**. ## Data preparation -**This step can be very long (24h+) for the XL split of GigaSpeech** +**This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU), the recipe must be run once without DDP, otherwise it will time out. You do not want to leave X GPUs sitting idle for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with datasets, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: diff --git a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml index 43f6307d02..3024e78522 100644 --- a/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml +++ b/recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -25,6 +25,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # and ["DEV", "TEST"] for the eval splits.
splits: ["XL", "DEV", "TEST"] skip_prep: False +data_prep_only: False download_with_HF: True convert_opus_to_wav: True keep_filler_words: False diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py index 309f45bde0..9dc6131db1 100644 --- a/recipes/GigaSpeech/ASR/transducer/train.py +++ b/recipes/GigaSpeech/ASR/transducer/train.py @@ -456,6 +456,9 @@ def text_pipeline(wrd): }, ) + if hparams["data_prep_only"]: + exit + # Defining tokenizer and loading it tokenizer = SentencePiece( model_dir=hparams["save_folder"], From 9e2af5bfac69d8f6f934e5ca557a0577acad02cb Mon Sep 17 00:00:00 2001 From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics Date: Fri, 11 Oct 2024 17:53:48 +0100 Subject: [PATCH 61/77] more documentation on storage --- recipes/GigaSpeech/ASR/CTC/README.md | 4 +++- recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py | 7 ++++++- recipes/GigaSpeech/ASR/transducer/README.md | 4 +++- recipes/GigaSpeech/ASR/transducer/train.py | 7 ++++++- recipes/GigaSpeech/gigaspeech_prepare.py | 1 + 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md index dbd4c20b79..34a584b9ec 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -6,6 +6,8 @@ GigaSpeech. Training can be done on any of the GigaSpeech subset (XL, L, S etc). ## Data access and download +**The XL set is fairly large, 2.2TB are necessary to store the compressed and uncompressed version of the data** + SpeechBrain supports two ways of dealing with the GigaSpeech dataset: 1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFacem note that **you must use** the HuggingFace client to log in first before running the recipe. 2. [Original Github](https://github.com/SpeechColab/GigaSpeech). @@ -18,7 +20,7 @@ much quicker**. **This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU) the recipe must be run once without DDP otherwise it will timeout. You do not want to let X GPUs hang out without doing nothing for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** -SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with dataset, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: +SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument is used to store the **extracted** dataset. However, HuggingFace first needs to download the compressed data, and this is not stored in *data_folder* by default. Indeed, HuggingFace is a bit strict in the way it operates with dataset, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. 
For example, you can set it like this: ```export HF_HUB_CACHE=/path/to/your/data/folder``` diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 99aa2588ef..3e5589dee6 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -357,7 +357,12 @@ def text_pipeline(wrd): ) if hparams["data_prep_only"]: - exit + logger.info( + "Data preparation finished. Restart the script with data_prep_only to False. " + ) + import sys + + sys.exit() # Defining tokenizer and loading it tokenizer = SentencePiece( diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md index d3c00cd101..bc5777dc91 100644 --- a/recipes/GigaSpeech/ASR/transducer/README.md +++ b/recipes/GigaSpeech/ASR/transducer/README.md @@ -4,6 +4,8 @@ Before running this recipe, make sure numba is installed (pip install numba) ## Data access and download +**The XL set is fairly large, 2.2TB are necessary to store the compressed and uncompressed version of the data** + SpeechBrain supports two ways of dealing with the GigaSpeech dataset: 1. [HuggingFace dataset](https://huggingface.co/datasets/speechcolab/gigaspeech/). For HuggingFacem note that **you must use** the HuggingFace client to log in first before running the recipe. 2. [Original Github](https://github.com/SpeechColab/GigaSpeech). @@ -16,7 +18,7 @@ much quicker**. **This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU) the recipe must be run once without DDP otherwise it will timeout. You do not want to let X GPUs hang out without doing nothing for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** -SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument in yaml becomes useless. Indeed, HuggingFace is a bit strict in the way it operates with dataset, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: +SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument is used to store the **extracted** dataset. However, HuggingFace first needs to download the compressed data, and this is not stored in *data_folder* by default. Indeed, HuggingFace is a bit strict in the way it operates with dataset, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: ```export HF_HUB_CACHE=/path/to/your/data/folder``` diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py index 9dc6131db1..af186ce209 100644 --- a/recipes/GigaSpeech/ASR/transducer/train.py +++ b/recipes/GigaSpeech/ASR/transducer/train.py @@ -457,7 +457,12 @@ def text_pipeline(wrd): ) if hparams["data_prep_only"]: - exit + logger.info( + "Data preparation finished. Restart the script with data_prep_only to False. 
" + ) + import sys + + sys.exit() # Defining tokenizer and loading it tokenizer = SentencePiece( diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 8c3ec631f6..b58623be92 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -226,6 +226,7 @@ def prepare_gigaspeech( train_split.lower(), trust_remote_code=True, data_dir=data_folder, + cache_dir=data_folder, num_proc=nproc, ) for split, output in save_output.items(): From 468147dcf9a0e227ead630d33905ee1e2ce9a3a2 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 17:59:17 +0100 Subject: [PATCH 62/77] missing arg --- recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index db7cee2e50..daa4c21ebd 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -25,6 +25,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # and ["DEV", "TEST"] for the eval splits. splits: ["XL", "DEV", "TEST"] skip_prep: False +data_prep_only: False download_with_HF: True convert_opus_to_wav: True keep_filler_words: False From 575a55ca466367fe00b22e3bdc4013b48027369d Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 18:07:16 +0100 Subject: [PATCH 63/77] a bit of logs --- recipes/GigaSpeech/dataset.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/dataset.py b/recipes/GigaSpeech/dataset.py index b2f9eeadc2..9841bc22c9 100644 --- a/recipes/GigaSpeech/dataset.py +++ b/recipes/GigaSpeech/dataset.py @@ -31,6 +31,10 @@ import datasets +from speechbrain.utils.logger import get_logger + +logger = get_logger(__name__) + _CITATION = """\ @article{DBLP:journals/corr/abs-2106-06909, author = {Guoguo Chen and @@ -239,7 +243,10 @@ def _split_generators(self, dl_manager): } for split in splits } - n_archives_paths = dl_manager.download_and_extract(n_archives_links) + logger.info("Downloading the data. It may take a while.") + paths = dl_manager.download(n_archives_links) + logger.info("Extracting the data. It may take a while.") + n_archives_paths = dl_manager.extract(paths) n_archives = { # mapping from a subset to a single number - number of audio archives (shards) in a subset split: { From 0886ec6b7b94b8beea3a532ea14e07d603d4c37c Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 19:38:42 +0100 Subject: [PATCH 64/77] new schedulers --- .../ASR/CTC/hparams/train_hf_wavlm.yaml | 41 ++++++++++--------- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 19 +++++++++ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index daa4c21ebd..3b789b58e0 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # see https://github.com/SpeechColab/GigaSpeech for more details on the dataset # must be one of ["XS", "S", "M", "L", "XL"] # and ["DEV", "TEST"] for the eval splits. 
-splits: ["XL", "DEV", "TEST"] +splits: ["XS", "DEV", "TEST"] skip_prep: False data_prep_only: False download_with_HF: True @@ -37,9 +37,13 @@ test_csv: !ref /test.csv json_file: !ref /GigaSpeech.json # Training parameters -number_of_epochs: 3 -optimizer_step_limit: 400000 -lr: 0.9 + +# The training will either stops at number_of_epochs or optimizer_step_limit +# I.e. the first that is reached. +number_of_epochs: 10 +optimizer_step_limit: 300000 +warmup: 1000 # Not much is needed as models are pretrained +lr: 0.001 lr_wav2vec: 0.0001 sorting: ascending num_workers: 4 @@ -95,6 +99,7 @@ character_coverage: 1.0 dnn_neurons: 1024 dropout: 0.1 freeze_wav2vec: False +freeze_wav2vec_extractor: False wav2vec_output_dim: 1024 # Outputs @@ -178,7 +183,7 @@ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Ve source: !ref output_norm: False freeze: !ref - freeze_feature_extractor: True + freeze_feature_extractor: !ref save_path: !ref ctc_lin: !new:speechbrain.nnet.linear.Linear @@ -199,25 +204,23 @@ modules: model: !new:torch.nn.ModuleList - [!ref , !ref ] -model_opt_class: !name:torch.optim.Adadelta +model_opt_class: !name:torch.optim.AdamW lr: !ref - rho: 0.95 - eps: 1.e-8 wav2vec_opt_class: !name:torch.optim.AdamW lr: !ref -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.75 - patient: 0 - -lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 +lr_annealing_model: !new:speechbrain.nnet.schedulers.WarmAndExpDecayLRSchedule + lr: !ref + n_warmup_steps: !ref + total_steps: !ref + decay_factor: 0.05 # Divided by twenty at the end. + +lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.WarmAndExpDecayLRSchedule + lr: !ref + n_warmup_steps: !ref + total_steps: !ref + decay_factor: 0.1 # Divided by ten at the end. checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index 3e5589dee6..d59f45e43b 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -167,6 +167,25 @@ def on_stage_end(self, stage, stage_loss, epoch): with open(self.hparams.test_wer_file, "w") as w: self.wer_metric.write_stats(w) + def on_fit_batch_end(self, batch, outputs, loss, should_step): + """Called after ``fit_batch()``. + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for training. Default implementation assumes + this batch has two elements: inputs and targets. + outputs : list or dictionary of torch.Tensors + Returned value of compute_forward(). + loss : torch.Tensor + Returned value of compute_objectives(). + should_step : boolean + Whether optimizer.step() was called or not. 
+ """ + + self.hparams.lr_annealing_model(self.model_optimizer) + self.hparams.lr_annealing_wav2vec(self.wav2vec_optimizer) + def init_optimizers(self): "Initializes the wav2vec2 optimizer and model optimizer" # Handling SpeechBrain vs HuggingFace pretrained models From e285300564b6bc5762756de4a989b6721135b927 Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 19:46:15 +0100 Subject: [PATCH 65/77] new schedulers --- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index d59f45e43b..ef609facf5 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -133,23 +133,16 @@ def on_stage_end(self, stage, stage_loss, epoch): # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_wav2vec, new_lr_wav2vec = self.hparams.lr_annealing_wav2vec( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.wav2vec_optimizer, new_lr_wav2vec - ) + new_lr_model = self.hparams.optimizer_model.param_groups[0]["lr"] + new_lr_wav2vec = self.hparams.optimizer_wav2vec2.param_groups[0][ + "lr" + ] + self.hparams.train_logger.log_stats( stats_meta={ "epoch": epoch, - "lr_model": old_lr_model, - "lr_wav2vec": old_lr_wav2vec, + "lr_model": new_lr_model, + "lr_wav2vec": new_lr_wav2vec, }, train_stats=self.train_stats, valid_stats=stage_stats, From a06221b59321e6351c4d51598c9bc1cb9cf56f3e Mon Sep 17 00:00:00 2001 From: TParcollet Date: Fri, 11 Oct 2024 21:29:34 +0100 Subject: [PATCH 66/77] fix my stupidity --- recipes/GigaSpeech/ASR/CTC/README.md | 2 +- .../ASR/CTC/hparams/train_hf_wavlm.yaml | 2 +- .../GigaSpeech/ASR/CTC/train_with_wavlm.py | 6 +- recipes/GigaSpeech/ASR/transducer/README.md | 2 +- recipes/GigaSpeech/gigaspeech_prepare.py | 58 +++++++++---------- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md index 34a584b9ec..564373c3f6 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -18,7 +18,7 @@ much quicker**. ## Data preparation -**This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU) the recipe must be run once without DDP otherwise it will timeout. You do not want to let X GPUs hang out without doing nothing for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** +**This step can be very long depending on your internet connection and filesystem for the XL split of GigaSpeech. For DDP (multi GPU) the recipe must be run once without DDP otherwise it will timeout. You do not want to let X GPUs hang out without doing nothing for hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument is used to store the **extracted** dataset. However, HuggingFace first needs to download the compressed data, and this is not stored in *data_folder* by default. 
Indeed, HuggingFace is a bit strict in the way it operates with datasets, and the data will be put into the folder specified by the environment variable *HF_HUB_CACHE* or, if not set, *HF_HOME* or, if not set, *XDG_CACHE_HOME*. Hence, we recommend setting the *HF_HUB_CACHE* to the place where you want to store the data first. For example, you can set it like this: diff --git a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml index 3b789b58e0..71d2c8c7c3 100644 --- a/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml +++ b/recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER # e,g./path/to/GigaSpeech # see https://github.com/SpeechColab/GigaSpeech for more details on the dataset # must be one of ["XS", "S", "M", "L", "XL"] # and ["DEV", "TEST"] for the eval splits. -splits: ["XS", "DEV", "TEST"] +splits: ["XL", "DEV", "TEST"] skip_prep: False data_prep_only: False download_with_HF: True diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py index ef609facf5..d2c1dc8152 100644 --- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py +++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py @@ -133,10 +133,8 @@ def on_stage_end(self, stage, stage_loss, epoch): # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: - new_lr_model = self.hparams.optimizer_model.param_groups[0]["lr"] - new_lr_wav2vec = self.hparams.optimizer_wav2vec2.param_groups[0][ - "lr" - ] + new_lr_model = self.model_optimizer.param_groups[0]["lr"] + new_lr_wav2vec = self.wav2vec_optimizer.param_groups[0]["lr"] self.hparams.train_logger.log_stats( stats_meta={ "epoch": epoch, diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md index bc5777dc91..b2a52a2648 100644 --- a/recipes/GigaSpeech/ASR/transducer/README.md +++ b/recipes/GigaSpeech/ASR/transducer/README.md @@ -16,7 +16,7 @@ much quicker**. ## Data preparation -**This step can be very long (24h+) for the XL split of GigaSpeech. For DDP (multi GPU), the recipe must be run once without DDP, otherwise it will time out. You do not want to leave X GPUs sitting idle for 24 hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** +**This step can be very long for the XL split of GigaSpeech, depending on your internet connection and filesystem. For DDP (multi GPU), the recipe must be run once without DDP, otherwise it will time out. You do not want to leave X GPUs sitting idle for hours anyway. Use the *data_prep_only* flag from the yaml to exit after data preparation** SpeechBrain will automatically download the dataset if you use HuggingFace. Note that if you use HuggingFace, the *data_folder* argument is used to store the **extracted** dataset. However, HuggingFace first needs to download the compressed data, and this is not stored in *data_folder* by default.
For example, you can set it like this: diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index b58623be92..e1ab542478 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -18,6 +18,8 @@ import os from dataclasses import dataclass +import torchaudio + from speechbrain.utils.parallel import parallel_map logger = logging.getLogger(__name__) @@ -436,10 +438,16 @@ def HF_create_csv( total_duration = 0.0 nb_samples = 0 + to_remove = GARBAGE_UTTERANCE_TAGS + if not punctuation: + to_remove += PUNCTUATION_TAGS + if not filler: + to_remove += FILLERS + line_processor = functools.partial( HF_process_line, + stopwords=to_remove, punctuation=punctuation, - filler=filler, ) csv_file_tmp = csv_file + ".tmp" @@ -488,7 +496,7 @@ def HF_create_csv( ) -def HF_process_line(row: dict, punctuation: bool, filler: bool) -> list: +def HF_process_line(row: dict, punctuation: bool, stopwords: list) -> list: """ Process the audio line and return the utterances for the given split. @@ -498,8 +506,8 @@ def HF_process_line(row: dict, punctuation: bool, filler: bool) -> list: The audio line to be processed. punctuation : bool Keeping punctuation or not. Default is no. - filler : bool - Keeping filler words or not (hum, er). Default is no. + stopwords: list + List of stopwords to remove from the text of the labels. Returns ------- @@ -508,16 +516,18 @@ def HF_process_line(row: dict, punctuation: bool, filler: bool) -> list: """ audio_path = os.path.join(row["audio"]["path"]) - assert os.path.isfile(audio_path), f"File not found: {audio_path}" + if not os.path.isfile(audio_path): + return None # check reading the audio file ; HF may have some corrupted files - # try: - # _ = sb.dataio.dataio.read_audio(audio_path) - # except Exception as e: - # logger.error(f"Failed reading {audio_path}: {e}") - # return None + try: + _ = torchaudio.info(audio_path) + except Exception as e: + logger.error(f"Failed reading {audio_path}: {e}") + return None + + text = preprocess_text(row["text"], punctuation, stopwords) - text = preprocess_text(row["text"], punctuation, filler) if text: utt_id = row["segment_id"] audio_id = row["audio_id"] @@ -568,7 +578,7 @@ def convert_opus2wav(audio_opus_path): return audio_wav_path -def preprocess_text(text: str, punctuation: bool, filler: bool) -> str: +def preprocess_text(text: str, punctuation: bool, stopwords) -> str: """ Preprocesses the input text by removing garbage tags and removing punctuation and filler words if specified. @@ -579,8 +589,8 @@ def preprocess_text(text: str, punctuation: bool, filler: bool) -> str: The input text to be preprocessed. punctuation : bool Keeping punctuation or not. Default is no. - filler : bool - Keeping filler words or not (hum, er). Default is no. + stopwords : list + List of words to remove from the input test string. 
Returns ------- @@ -608,28 +618,14 @@ def preprocess_text(text: str, punctuation: bool, filler: bool) -> str: text = text.upper() text = text.replace("-", " ") - to_remove = GARBAGE_UTTERANCE_TAGS - if not punctuation: - to_remove += PUNCTUATION_TAGS - if not filler: - to_remove += FILLERS - - processed = [] - for word in text.split(): - if word in to_remove: - continue - processed.append(word) - - sentence = " ".join(processed) + sentence = " ".join( + [word for word in text.split() if word not in stopwords] + ) if punctuation: for tag, punctuation in PUNCTUATION_TAGS.items(): sentence = sentence.replace(" " + tag, punctuation) - assert ( - "<" not in sentence and ">" not in sentence - ), f"Found tags in the text: {sentence}" - return sentence From 6bd627c251cf264ac181a76b99d6677b74420240 Mon Sep 17 00:00:00 2001 From: asu Date: Tue, 15 Oct 2024 13:50:21 +0200 Subject: [PATCH 67/77] Update non-HF code path for new preprocessing code in GigaSpeech --- recipes/GigaSpeech/gigaspeech_prepare.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index e1ab542478..3419f19fb5 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -262,8 +262,8 @@ def process_line( data_folder: str, split: str, convert_opus_to_wav: bool, - punctuation: bool = False, - filler: bool = False, + punctuation: bool, + stopwords: list ) -> list: """ Process the audio line and return the utterances for the given split. @@ -280,8 +280,8 @@ def process_line( If True, the opus files will be converted to wav files. punctuation : bool Keeping punctuation or not. Default is no. - filler : bool - Keeping filler words or not (hum, er). Default is no. + stopwords: list + List of stopwords to remove from the text of the labels. Returns ------- @@ -299,7 +299,7 @@ def process_line( # 2. 
iterate over the utterances utterances = [] for segment in audio["segments"]: - text = preprocess_text(segment["text_tn"], punctuation, filler) + text = preprocess_text(segment["text_tn"], punctuation, stopwords) if text: begin_time = float(segment["begin_time"]) end_time = float(segment["end_time"]) @@ -354,11 +354,19 @@ def create_csv( total_duration = 0.0 nb_samples = 0 + to_remove = GARBAGE_UTTERANCE_TAGS + if not punctuation: + to_remove += PUNCTUATION_TAGS + if not filler: + to_remove += FILLERS + line_processor = functools.partial( process_line, data_folder=data_folder, split=split, convert_opus_to_wav=convert_opus_to_wav, + stopwords=to_remove, + punctuation=punctuation, ) csv_file_tmp = csv_file + ".tmp" From dd28c73b8838a8aefa63b5215bbf375f7684ea21 Mon Sep 17 00:00:00 2001 From: asu Date: Tue, 15 Oct 2024 14:01:36 +0200 Subject: [PATCH 68/77] Fix CSV path for non-HF Gigaspeech --- recipes/GigaSpeech/gigaspeech_prepare.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 3419f19fb5..ad0cd88359 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -175,16 +175,20 @@ def prepare_gigaspeech( # Setting output paths save_output = {} + split_map = {} train_split = "" for split in splits: if split in TRAIN_SUBSET: save_output["train"] = output_train + split_map["train"] = split train_split = split else: if split == "DEV": save_output["validation"] = output_dev + split_map["validation"] = split elif split == "TEST": save_output["test"] = output_test + split_map["test"] = split # check if the data is already prepared if skip_csv(save_output): @@ -249,7 +253,7 @@ def prepare_gigaspeech( output, info, data_folder, - split, + split_map[split], convert_opus_to_wav, punctuation, filler, From ab79b48b58b7c73b0be514e3ab0a425c25f302b5 Mon Sep 17 00:00:00 2001 From: asu Date: Tue, 15 Oct 2024 14:16:22 +0200 Subject: [PATCH 69/77] Fix formatting --- recipes/GigaSpeech/gigaspeech_prepare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index ad0cd88359..8dadfb3087 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -267,7 +267,7 @@ def process_line( split: str, convert_opus_to_wav: bool, punctuation: bool, - stopwords: list + stopwords: list, ) -> list: """ Process the audio line and return the utterances for the given split. From 4e64041d9a399adb42b9f208a312c15c28f91b16 Mon Sep 17 00:00:00 2001 From: asu Date: Fri, 18 Oct 2024 11:18:41 +0200 Subject: [PATCH 70/77] Fix preprocess_text example --- recipes/GigaSpeech/gigaspeech_prepare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py index 8dadfb3087..a7d9ed7146 100644 --- a/recipes/GigaSpeech/gigaspeech_prepare.py +++ b/recipes/GigaSpeech/gigaspeech_prepare.py @@ -623,8 +623,8 @@ def preprocess_text(text: str, punctuation: bool, stopwords) -> str: Examples -------- >>> text = " DOUGLAS MCGRAY IS GOING TO BE OUR GUIDE YOU WALK THROUGH THE DOOR YOU SEE THE RED CARPETING YOU SEE SOMEONE IN A SUIT THEY MAY BE GREETING YOU " - >>> preprocess_text(text) - "douglas mcgray is going to be our guide you walk through the door, you see the red carpeting, you see someone in a suit. they may be greeting you." 
+ >>> preprocess_text(text, punctuation=True, stopwords=GARBAGE_UTTERANCE_TAGS) + "DOUGLAS MCGRAY IS GOING TO BE OUR GUIDE YOU WALK THROUGH THE DOOR, YOU SEE THE RED CARPETING, YOU SEE SOMEONE IN A SUIT. THEY MAY BE GREETING YOU." """ text = text.upper() From 906ada0e288ed0a737d7f4ce1ed75a6456ade9e0 Mon Sep 17 00:00:00 2001 From: Adel Moumen <88119391+Adel-Moumen@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:46:40 +0100 Subject: [PATCH 71/77] add citing in README --- recipes/GigaSpeech/ASR/CTC/README.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md index 564373c3f6..3e0af9b63b 100644 --- a/recipes/GigaSpeech/ASR/CTC/README.md +++ b/recipes/GigaSpeech/ASR/CTC/README.md @@ -62,4 +62,28 @@ This can be done by modifying the current recipe. We invite you to have a look a | Release | Hyperparams file | Decoding method | Finetuning Split | Test WER | Dev WER | HuggingFace link | Full model link | Training GPUs | |:-------------:|:---------------------------:| :----------:| :-----:| :-----:| :-----:| :-----:| :-----:| :-----:| -| 05-08-23 | train_hf_wavlm.yaml | GreedySearch | XL | xx | xx | TBD | TBD | 4xRTX 3090 | \ No newline at end of file +| 05-08-23 | train_hf_wavlm.yaml | GreedySearch | XL | xx | xx | TBD | TBD | 4xRTX 3090 | + +# **Citing SpeechBrain** +Please, cite SpeechBrain if you use it for your research or business. + +```bibtex +@misc{speechbrainV1, + title={Open-Source Conversational AI with SpeechBrain 1.0}, + author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve}, + year={2024}, + eprint={2407.00463}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2407.00463}, +} +@misc{speechbrain, + title={{SpeechBrain}: A General-Purpose Speech Toolkit}, + author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio}, + year={2021}, + eprint={2106.04624}, + archivePrefix={arXiv}, + primaryClass={eess.AS}, + note={arXiv:2106.04624} +} +``` From de7a7e801d565f12da27c349029bbcbfc2b31700 Mon Sep 17 00:00:00 2001 From: asu Date: Wed, 23 Oct 2024 13:33:23 +0200 Subject: [PATCH 72/77] Fix GS transducer test prediction decoding? 
---
 recipes/GigaSpeech/ASR/transducer/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py
index af186ce209..af91a325dc 100644
--- a/recipes/GigaSpeech/ASR/transducer/train.py
+++ b/recipes/GigaSpeech/ASR/transducer/train.py
@@ -202,7 +202,8 @@ def compute_objectives(self, predictions, batch, stage):
             )
         elif stage == sb.Stage.TEST:
             predicted_words = [
-                hyp[0].text.split(" ") for hyp in predicted_tokens
+                self.tokenizer.decode_ids(utt_seq).split(" ")
+                for utt_seq in predicted_tokens
             ]
 
         if stage != sb.Stage.TRAIN:

From 5b15078a40d93482bac1a8ac5093e6f6908c688b Mon Sep 17 00:00:00 2001
From: asu
Date: Wed, 23 Oct 2024 13:46:56 +0200
Subject: [PATCH 73/77] Actually fix GS transducer test prediction decoding

---
 recipes/GigaSpeech/ASR/transducer/train.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/recipes/GigaSpeech/ASR/transducer/train.py b/recipes/GigaSpeech/ASR/transducer/train.py
index af91a325dc..280946a065 100644
--- a/recipes/GigaSpeech/ASR/transducer/train.py
+++ b/recipes/GigaSpeech/ASR/transducer/train.py
@@ -195,18 +195,12 @@ def compute_objectives(self, predictions, batch, stage):
             logits_transducer, tokens, wav_lens, token_lens
         )
 
-        if stage == sb.Stage.VALID:
+        if stage != sb.Stage.TRAIN:
             # Decode token terms to words
             predicted_words = self.tokenizer(
                 predicted_tokens, task="decode_from_list"
             )
 
-        elif stage == sb.Stage.TEST:
-            predicted_words = [
-                self.tokenizer.decode_ids(utt_seq).split(" ")
-                for utt_seq in predicted_tokens
-            ]
-
-        if stage != sb.Stage.TRAIN:
             # Convert indices to words
             target_words = undo_padding(tokens, token_lens)
             target_words = self.tokenizer(target_words, task="decode_from_list")

From 76a803bcacd02377fbed49a9b1bb633433e4779d Mon Sep 17 00:00:00 2001
From: asu
Date: Wed, 23 Oct 2024 15:54:47 +0200
Subject: [PATCH 74/77] Remove punctuation filtering that is handled elsewhere

---
 recipes/GigaSpeech/gigaspeech_prepare.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/recipes/GigaSpeech/gigaspeech_prepare.py b/recipes/GigaSpeech/gigaspeech_prepare.py
index a7d9ed7146..cdfb502cc1 100644
--- a/recipes/GigaSpeech/gigaspeech_prepare.py
+++ b/recipes/GigaSpeech/gigaspeech_prepare.py
@@ -359,8 +359,6 @@ def create_csv(
     nb_samples = 0
 
     to_remove = GARBAGE_UTTERANCE_TAGS
-    if not punctuation:
-        to_remove += PUNCTUATION_TAGS
     if not filler:
         to_remove += FILLERS
 
@@ -449,8 +449,6 @@ def HF_create_csv(
     nb_samples = 0
 
     to_remove = GARBAGE_UTTERANCE_TAGS
-    if not punctuation:
-        to_remove += PUNCTUATION_TAGS
     if not filler:
         to_remove += FILLERS
 

From 231c78ac2559363a8867852048814cca6208e609 Mon Sep 17 00:00:00 2001
From: asu
Date: Wed, 23 Oct 2024 15:57:49 +0200
Subject: [PATCH 75/77] HuggingFance

---
 recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py       | 2 +-
 recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
index d2c1dc8152..60e1f6bb4a 100644
--- a/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
+++ b/recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py
@@ -44,7 +44,7 @@ def compute_forward(self, batch, stage):
 
         # Forward pass
 
-        # Handling SpeechBrain vs HuggingFance pretrained models
+        # Handling SpeechBrain vs HuggingFace pretrained models
         if hasattr(self.modules, "extractor"):
             # SpeechBrain pretrained model
             latents = self.modules.extractor(wavs)
             feats = self.modules.encoder_wrapper(latents, wav_lens=wav_lens)[
diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py
index 3501a268fa..2ddc4b72be 100644
--- a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py
+++ b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py
@@ -55,7 +55,7 @@ def compute_forward(self, batch, stage):
 
         # Forward pass
 
-        # Handling SpeechBrain vs HuggingFance pretrained models
+        # Handling SpeechBrain vs HuggingFace pretrained models
         if hasattr(self.modules, "extractor"):
             # SpeechBrain pretrained model
             latents = self.modules.extractor(wavs)
             feats = self.modules.encoder_wrapper(latents, wav_lens=wav_lens)[

From d98e949c6064a55df19cd2205d67a5a34db184b6 Mon Sep 17 00:00:00 2001
From: asu
Date: Fri, 25 Oct 2024 14:35:26 +0200
Subject: [PATCH 76/77] Add results and notices for results for GigaSpeech
 transducer & wavlm

---
 recipes/GigaSpeech/ASR/CTC/README.md        |  4 +++-
 recipes/GigaSpeech/ASR/transducer/README.md | 12 +++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/recipes/GigaSpeech/ASR/CTC/README.md b/recipes/GigaSpeech/ASR/CTC/README.md
index 3e0af9b63b..488906ecb6 100644
--- a/recipes/GigaSpeech/ASR/CTC/README.md
+++ b/recipes/GigaSpeech/ASR/CTC/README.md
@@ -62,7 +62,9 @@ This can be done by modifying the current recipe. We invite you to have a look a
 
 | Release | Hyperparams file | Decoding method | Finetuning Split | Test WER | Dev WER | HuggingFace link | Full model link | Training GPUs |
 |:-------------:|:---------------------------:| :----------:| :-----:| :-----:| :-----:| :-----:| :-----:| :-----:|
-| 05-08-23 | train_hf_wavlm.yaml | GreedySearch | XL | xx | xx | TBD | TBD | 4xRTX 3090 |
+| 25-10-2024 | train_hf_wavlm.yaml | GreedySearch | XL | 11.88% | 11.86% | Unavailable\* | Unavailable\* | 8xRTX 3090 |
+
+\*: Unfortunately, we are unable to upload the checkpoints for the WavLM model at this time. We currently don't have plans to remedy this.
 
 # **Citing SpeechBrain**
 Please, cite SpeechBrain if you use it for your research or business.
diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md
index b2a52a2648..d672f3c45c 100644
--- a/recipes/GigaSpeech/ASR/transducer/README.md
+++ b/recipes/GigaSpeech/ASR/transducer/README.md
@@ -48,10 +48,18 @@ According to our tests, the performance is not affected.
 
 Results are obtained with beam search and no LM (no-streaming i.e. full context).
 
+**TBD: The final models are currently in training.** This model has already been succesfully trained, though. This will be updated when the checkpoints are ready for download.
+
+
+
 
 ## Streaming model
 
@@ -74,6 +82,8 @@
 may end up forming indirect dependencies to audio many seconds ago.
 
 | | full | cs=32 (1280ms) | 16 (640ms) | 8 (320ms) |
 |:-----:|:----:|:-----:|:-----:|:-----:|
+**TBD: The final models are currently in training.** This model has already been succesfully trained, though. This will be updated when the checkpoints are ready for download.
+
 
 ### Inference
 
 Once your model is trained, you need a few manual steps in order to use it with the high-level streaming interfaces (`speechbrain.inference.ASR.StreamingASR`):

From db5b629d7249609338d3a5cc25a6b43a34aa7f03 Mon Sep 17 00:00:00 2001
From: asu
Date: Fri, 25 Oct 2024 14:42:07 +0200
Subject: [PATCH 77/77] english hard

---
 recipes/GigaSpeech/ASR/transducer/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/GigaSpeech/ASR/transducer/README.md b/recipes/GigaSpeech/ASR/transducer/README.md
index d672f3c45c..46e8953160 100644
--- a/recipes/GigaSpeech/ASR/transducer/README.md
+++ b/recipes/GigaSpeech/ASR/transducer/README.md
@@ -48,7 +48,7 @@ According to our tests, the performance is not affected.
 
 Results are obtained with beam search and no LM (no-streaming i.e. full context).
 
-**TBD: The final models are currently in training.** This model has already been succesfully trained, though. This will be updated when the checkpoints are ready for download.
+**TBD: The final models are currently in training.** This model has already been successfully trained, though. This will be updated when the checkpoints are ready for download.
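Taken together, patches 68 through 74 settle the shape of the data-preparation entry point. The sketch below shows how it might be invoked once the series is applied. It is a minimal illustration, not part of any commit above: the argument names mirror those visible in the diffs (`splits`, `output_train`/`output_dev`/`output_test`, `convert_opus_to_wav`, `punctuation`, `filler`), but the exact final signature of `prepare_gigaspeech` should be treated as an assumption rather than a confirmed API.

```python
# Minimal usage sketch for the GigaSpeech data preparation entry point.
# Assumption: the signature matches the parameter names visible in the
# patches above; all paths are placeholders.
from gigaspeech_prepare import prepare_gigaspeech

prepare_gigaspeech(
    data_folder="/path/to/GigaSpeech",  # root holding GigaSpeech.json and audio/
    save_folder="results/gigaspeech_prep",
    splits=["XL", "DEV", "TEST"],  # one train subset plus the dev/test splits
    output_train="results/gigaspeech_prep/train.csv",
    output_dev="results/gigaspeech_prep/dev.csv",
    output_test="results/gigaspeech_prep/test.csv",
    json_file="GigaSpeech.json",
    skip_prep=False,
    convert_opus_to_wav=True,  # decode .opus files to .wav during preparation
    punctuation=True,  # map <COMMA>/<PERIOD>/... tags to punctuation symbols
    filler=False,  # False adds filler words to the stopword list, removing them
)
```

As patch 74's subject notes, punctuation filtering is handled elsewhere: after that commit, only the garbage utterance tags and, optionally, the fillers flow through the stopword list handed to `process_line`, while the `punctuation` flag is forwarded separately.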