diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
index fd84329da..46a08cd91 100644
--- a/lhotse/recipes/commonvoice.py
+++ b/lhotse/recipes/commonvoice.py
@@ -9,6 +9,7 @@ How does it work? We are crowdsourcing an open-source dataset of voices. Donate
 your voice, validate the accuracy of other people's clips, make the dataset
 better for everyone.
 """
+import csv
 import logging
 import math
 import numbers
@@ -149,14 +150,13 @@ def _parse_utterance(
     language: str,
     audio_info: str,
 ) -> Optional[Tuple[Recording, SupervisionSegment]]:
-    audio_info = audio_info.split("\t", -1)
-    audio_path = lang_path / "clips" / audio_info[1]
+    audio_path = lang_path / "clips" / audio_info["path"]
 
     if not audio_path.is_file():
         logging.info(f"No such file: {audio_path}")
         return None
 
-    recording_id = Path(audio_info[1]).stem
+    recording_id = Path(audio_info["path"]).stem
     recording = Recording.from_file(path=audio_path, recording_id=recording_id)
 
     segment = SupervisionSegment(
@@ -166,12 +166,13 @@ def _parse_utterance(
         duration=recording.duration,
         channel=0,
         language=language,
-        speaker=audio_info[0],
-        text=audio_info[2].strip(),
-        gender=audio_info[6],
+        speaker=audio_info["client_id"],
+        text=audio_info["sentence"].strip(),
+        gender=audio_info["gender"],
         custom={
-            "age": audio_info[5],
-            "accents": audio_info[7],
+            "age": audio_info["age"],
+            "accents": audio_info["accents"],
+            "variant": audio_info["variant"],
         },
     )
     return recording, segment
@@ -207,19 +208,22 @@ def _prepare_part(
         futures = []
         recordings = []
         supervisions = []
+        audio_infos = []
 
-        with open(tsv_path) as f:
-            audio_infos = iter(f.readlines())
+        with open(tsv_path, "r") as f:
 
-        for audio_info in tqdm(audio_infos, desc="Distributing tasks"):
-            futures.append(
-                ex.submit(
-                    _parse_utterance,
-                    lang_path,
-                    lang,
-                    audio_info,
+            # Note: using QUOTE_NONE as CV dataset contains unbalanced quotes, cleanup needed later
+            audio_infos = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+
+            for audio_info in tqdm(audio_infos, desc="Distributing tasks"):
+                futures.append(
+                    ex.submit(
+                        _parse_utterance,
+                        lang_path,
+                        lang,
+                        audio_info,
+                    )
                 )
-            )
 
         for future in tqdm(futures, desc="Processing"):
             result = future.result()