Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds radio data recipe #1400

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from .nsc import *
from .peoples_speech import *
from .primewords import *
from .radio import *
from .reazonspeech import *
from .rir_noise import *
from .sbcsae import *
Expand Down
41 changes: 41 additions & 0 deletions lhotse/bin/modes/recipes/radio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from typing import List, Optional, Sequence, Tuple, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.radio import prepare_radio
from lhotse.utils import Pathlike

__all__ = ["radio"]


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(dir_okay=True))
@click.argument("output_dir", type=click.Path(dir_okay=True))
@click.option(
    "-d",
    "--min-seg-dur",
    type=float,
    default=0.5,
    help="The minimum segment duration",
)
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=4,
    help="The number of parallel threads to use for data preparation",
)
def radio(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    min_seg_dur: float = 0.5,
    num_jobs: int = 4,
):
    """
    Prepare Lhotse manifests for the radio corpus.

    CORPUS_DIR is the root of the downloaded corpus; OUTPUT_DIR is where the
    recording and supervision manifests are written. See
    ``lhotse.recipes.radio`` for a description of the corpus and how to
    obtain it.
    """
    # Thin CLI wrapper: all real work happens in prepare_radio().
    # Note the CLI name "--min-seg-dur" maps to the recipe's
    # "min_segment_duration" keyword.
    prepare_radio(
        corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
        min_segment_duration=min_seg_dur,
    )
2 changes: 2 additions & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
from .musan import download_musan, prepare_musan
from .nsc import prepare_nsc
from .peoples_speech import prepare_peoples_speech
from .radio import prepare_radio
from .reazonspeech import download_reazonspeech, prepare_reazonspeech
from .rir_noise import download_rir_noise, prepare_rir_noise
from .sbcsae import download_sbcsae, prepare_sbcsae
Expand Down Expand Up @@ -194,6 +195,7 @@
"prepare_peoples_speech",
"download_reazonspeech",
"prepare_reazonspeech",
"prepare_radio",
"download_rir_noise",
"prepare_rir_noise",
"prepare_slu",
Expand Down
141 changes: 141 additions & 0 deletions lhotse/recipes/radio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""
This recipe prepares data collected from radio streamed on the web. The data
have some metadata attached to them, including the geographic location of
broadcast, date and time of the recorded clip, as well as a unique station
identifier.

Obtaining the data
-----------------------------------------------------------
If you want to use this corpus please email: [email protected]

As the data are collected from radio streams, they cannot be broadly
disseminated or used for commercial purposes. In the email, include your
affiliated academic institution and the intended use of the data, and we will
send the data to you if it is indeed for non-commercial, academic purposes.

Description
------------------------------------------------------------
The data consist of ∼4000 hours of speech collected between
September 27, 2023 to October 1, 2023, in 9449 locations all over the world,
from 17171 stations.

These data were used for Geolocation of speech in order to answer the question,
Where are you from? in the paper

Where are you from? Geolocating Speech and Applications to Language
Identification, presented at NAACL 2024. Please read it for a full description
and please cite as

@inproceedings{foley2024you,
title={Where are you from? Geolocating Speech and Applications to Language Identification},
author={Foley, Patrick and Wiesner, Matthew and Odoom, Bismarck and Perera, Leibny Paola Garcia and Murray, Kenton and Koehn, Philipp},
booktitle={Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
pages={5114--5126},
year={2024}
}
"""
import json
import re
from functools import partial
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm import tqdm

from lhotse.audio import Recording, RecordingSet, set_ffmpeg_torchaudio_info_enabled
from lhotse.parallel import parallel_map
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

# NOTE(review): this runs at import time and mutates global torchaudio/ffmpeg
# behavior for any code that imports lhotse. Prefer enabling/disabling it
# inside prepare_radio() (it may also be usable as a context manager) —
# confirm with the lhotse.audio API before changing.
set_ffmpeg_torchaudio_info_enabled(False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One last request - don’t set these things in the global scope because it’ll be executed when Lhotse is imported. Move it to local scope (even better i think this one works like a context manager)



def _make_reco_and_sups_from_file(sf: Path, msd: float = 0.5):
    """
    Build a Recording and its SupervisionSegments from one segments JSON file.

    The file stem encodes the clip metadata, e.g.
    ``2023_10_01_09h_02m_54s_dur30_ZnpbY9Zx_lat3.17_long113.04``
    (date & time, duration, station id, latitude, longitude). The matching
    audio is expected at ``<corpus>/recos/recos.<N>/<stem>.flac``, mirroring
    the ``segs/segs.<N>/<stem>.json`` layout.

    :param sf: path to a ``segs/segs.<N>/<name>.json`` segment file.
    :param msd: minimum segment duration in seconds; shorter segments
        (and segments not labeled male/female) are skipped.
    :return: a tuple ``(sups, reco)`` of the supervision list and Recording.
    """
    corpus_dir = sf.parents[2]
    audio_dir = corpus_dir / "recos"
    fname = sf.with_suffix(".flac").stem

    # The chunk index N comes from the parent directory suffix ("segs.N");
    # the audio lives in the parallel "recos.N" directory.
    chunk_idx = int(sf.parent.suffix.strip("."))
    reco_file = audio_dir / f"recos.{chunk_idx}" / f"{fname}.flac"
    reco = Recording.from_file(reco_file, recording_id=fname)
    reco.channel_ids = [0]

    with open(sf) as f:
        segments = json.load(f)

    # Parse the file-name format shown above to recover:
    # date, station, latitude, and longitude.
    lat, lon = re.search(r"lat[^_]+_long[^_]+", sf.stem).group(0).split("_")
    lat = float(lat.replace("lat", ""))
    lon = float(lon.replace("long", ""))
    station = re.search(r"s_dur[0-9]+_(.*)_lat[^_]+_long[^_]+", fname).groups()[0]
    fname_vals = fname.split("_")
    date = [int(i.strip("hms")) for i in fname_vals[0:6]]  # YY MM DD hh mm ss

    sups = []
    for seg in segments:
        # Each entry looks like [label, start, end]; only gender-labeled
        # speech segments longer than the minimum duration are kept.
        start, end = float(seg[1]), float(seg[2])
        dur = end - start
        if seg[0] in ("male", "female") and dur > msd:
            sups.append(
                SupervisionSegment(
                    id=f"{fname}_{int(100*start):04}",
                    recording_id=fname,
                    start=start,
                    duration=round(dur, 4),
                    channel=0,
                    custom={
                        "date": date,
                        "lat": lat,
                        "lon": lon,
                        "station": station,
                        "est_gender": seg[0],
                    },
                )
            )
    return sups, reco


def prepare_radio(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    min_segment_duration: float = 0.5,
    num_jobs: int = 4,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Return the manifests which consist of recordings and supervisions.

    :param corpus_dir: Path to the collected radio samples. Segment files
        are discovered via the ``segs/*/*.json`` pattern.
    :param output_dir: Path where the manifests are written. Required —
        manifests are streamed to disk while being created.
    :param min_segment_duration: discard segments shorter than this (seconds).
    :param num_jobs: number of parallel workers used for data preparation.
    :return: A Dict with keys 'recordings' and 'supervisions' holding the
        lazily-opened manifests.
    :raises ValueError: if ``output_dir`` is None.
    """
    if output_dir is None:
        # The original signature advertises output_dir as optional, but the
        # manifests are written incrementally to disk, so a path is required.
        # Previously a None value crashed with AttributeError below.
        raise ValueError("output_dir is required for the radio recipe.")

    # Avoid the slow ffmpeg-based torchaudio info backend while scanning
    # many small flac files. Set locally (rather than relying on module
    # import side effects) per review feedback.
    set_ffmpeg_torchaudio_info_enabled(False)

    corpus_dir = Path(corpus_dir)
    # Sort for a deterministic processing (and manifest) order.
    segment_files = sorted(corpus_dir.rglob("segs/*/*.json"))
    fun = partial(_make_reco_and_sups_from_file, msd=min_segment_duration)

    output_dir = Path(output_dir)
    # mode=511 in the original is just the default 0o777; let mkdir use it.
    output_dir.mkdir(parents=True, exist_ok=True)

    with RecordingSet.open_writer(
        output_dir / "radio_recordings.jsonl.gz"
    ) as rec_writer, SupervisionSet.open_writer(
        output_dir / "radio_supervisions.jsonl.gz"
    ) as sup_writer:
        for sups, reco in tqdm(
            parallel_map(
                fun,
                segment_files,
                num_jobs=num_jobs,
            ),
            desc="Making recordings and supervisions",
        ):
            rec_writer.write(reco)
            for sup in sups:
                sup_writer.write(sup)

    manifests = {
        "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
        "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
    }

    return manifests
Loading