Skip to content

Commit

Permalink
Allow skipping missing files in AMI download (#1318)
Browse files: browse the repository at this point in the history
  • Loading branch information
pzelasko authored Apr 23, 2024
1 parent 9bf1b8f commit ad66889
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
24 changes: 10 additions & 14 deletions lhotse/recipes/ami.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,10 @@ def download_audio(
  wav_dir.mkdir(parents=True, exist_ok=True)
  wav_path = wav_dir / wav_name
  resumable_download(
- wav_url, filename=wav_path, force_download=force_download
+ wav_url,
+ filename=wav_path,
+ force_download=force_download,
+ missing_ok=True,
  )
  elif mic == "mdm":
  for array in MDM_ARRAYS:
Expand All @@ -208,19 +211,12 @@ def download_audio(
  wav_dir = target_dir / "wav_db" / item / "audio"
  wav_dir.mkdir(parents=True, exist_ok=True)
  wav_path = wav_dir / wav_name
- try:
- resumable_download(
- wav_url, filename=wav_path, force_download=force_download
- )
- except urllib.error.HTTPError as err:
- if err.code == 404:
- logging.warning(
- f"{wav_url} does not exist. Skipping this file."
- )
- if os.path.exists(wav_path) and os.path.isfile(wav_path):
- os.remove(wav_path)
- else:
- raise err
+ resumable_download(
+ wav_url,
+ filename=wav_path,
+ force_download=force_download,
+ missing_ok=True,
+ )
  elif mic == "mdm8-bf":
  wav_name = f"{item}_MDM8.wav"
  wav_url = f"{url}/AMICorpusMirror/amicorpus/beamformed/{item}/{wav_name}"
Expand Down
9 changes: 8 additions & 1 deletion lhotse/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ def resumable_download(
  filename: Pathlike,
  force_download: bool = False,
  completed_file_size: Optional[int] = None,
+ missing_ok: bool = False,
  ) -> None:
  # Check if the file exists and get its size
  file_exists = os.path.exists(filename)
Expand Down Expand Up @@ -518,7 +519,13 @@ def _download(rq, size):
  except urllib.error.HTTPError as e:
  # "Request Range Not Satisfiable" means the requested range
  # starts after the file ends OR that the server does not support range requests.
- if e.code == 416:
+ if e.code == 404 and missing_ok:
+ logging.warning(
+ f"{url} does not exist (error 404). Skipping this file."
+ )
+ if Path(filename).is_file():
+ os.remove(filename)
+ elif e.code == 416:
  content_range = e.headers.get("Content-Range", None)
  if content_range is None:
  # sometimes, the server actually supports range requests
Expand Down

0 comments on commit ad66889

Please sign in to comment.