Skip to content

Commit

Permalink
In CommonVoice corpus, use .tsv headers to parse and not column index (#1328)
Browse files Browse the repository at this point in the history

* Fix for cv corpus

* Fix for cv corpus x2

* Debug serialization problem

* Debug serialization problem

* Undo

* Handle quote pollution in CV dataset
  • Loading branch information
daniel-dona authored Apr 29, 2024
1 parent ed5797c commit b2dce78
Showing 1 changed file with 22 additions and 18 deletions.
40 changes: 22 additions & 18 deletions lhotse/recipes/commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
How does it work?
We are crowdsourcing an open-source dataset of voices. Donate your voice, validate the accuracy of other people's clips, make the dataset better for everyone.
"""
import csv
import logging
import math
import numbers
Expand Down Expand Up @@ -149,14 +150,13 @@ def _parse_utterance(
language: str,
audio_info: str,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
audio_info = audio_info.split("\t", -1)
audio_path = lang_path / "clips" / audio_info[1]
audio_path = lang_path / "clips" / audio_info["path"]

if not audio_path.is_file():
logging.info(f"No such file: {audio_path}")
return None

recording_id = Path(audio_info[1]).stem
recording_id = Path(audio_info["path"]).stem
recording = Recording.from_file(path=audio_path, recording_id=recording_id)

segment = SupervisionSegment(
Expand All @@ -166,12 +166,13 @@ def _parse_utterance(
duration=recording.duration,
channel=0,
language=language,
speaker=audio_info[0],
text=audio_info[2].strip(),
gender=audio_info[6],
speaker=audio_info["client_id"],
text=audio_info["sentence"].strip(),
gender=audio_info["gender"],
custom={
"age": audio_info[5],
"accents": audio_info[7],
"age": audio_info["age"],
"accents": audio_info["accents"],
"variant": audio_info["variant"],
},
)
return recording, segment
Expand Down Expand Up @@ -207,19 +208,22 @@ def _prepare_part(
futures = []
recordings = []
supervisions = []
audio_infos = []

with open(tsv_path) as f:
audio_infos = iter(f.readlines())
with open(tsv_path, "r") as f:

for audio_info in tqdm(audio_infos, desc="Distributing tasks"):
futures.append(
ex.submit(
_parse_utterance,
lang_path,
lang,
audio_info,
# Note: using QUOTE_NONE as CV dataset contains unbalanced quotes, cleanup needed later
audio_infos = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

for audio_info in tqdm(audio_infos, desc="Distributing tasks"):
futures.append(
ex.submit(
_parse_utterance,
lang_path,
lang,
audio_info,
)
)
)

for future in tqdm(futures, desc="Processing"):
result = future.result()
Expand Down

0 comments on commit b2dce78

Please sign in to comment.