From 59203856e1f0c3db6774d9099fb27ee649d801fa Mon Sep 17 00:00:00 2001 From: cnovel Date: Sun, 16 Jun 2019 14:03:50 +0200 Subject: [PATCH 1/2] Better way to download, with retry and continue --- src/bulk_downloader.py | 59 ++++++++++++++++++++++++++++--- src/tests/test_bulk_downloader.py | 38 ++++++++++++++++++++ 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/src/bulk_downloader.py b/src/bulk_downloader.py index 9cb81be..5405167 100644 --- a/src/bulk_downloader.py +++ b/src/bulk_downloader.py @@ -1,9 +1,9 @@ from bs4 import BeautifulSoup import requests import os.path -import urllib import logging import argparse +from time import sleep from xml.etree import ElementTree @@ -11,6 +11,55 @@ class BulkDownloaderException(Exception): pass +def download_with_resume(url, path): + logging.debug("Downloading {} to {}".format(url, path)) + + # Clean existing file + if os.path.exists(path): + os.remove(path) + + try: + r = requests.head(url) + except requests.exceptions as e: + logging.error(e) + return False + + if r.status_code < 200 or r.status_code > 302: + logging.error("Failed to reach {}, status is {}".format(url, r.status_code)) + r.close() + return False + + expected_size = int(r.headers.get("content-length")) + r.close() + + chunk_size = 2**20 + last_byte = 0 + with open(path, 'wb') as f: + while last_byte < expected_size: + logging.debug("{} vs {}".format(last_byte, expected_size)) + logging.debug("Starting download with already {}% of the file".format((100*last_byte)/expected_size)) + resume_header = {'Range': 'bytes=%d-' % last_byte} + resume_request = requests.get(url, headers=resume_header, stream=True, + verify=True, allow_redirects=True) + for data in resume_request.iter_content(chunk_size): + f.write(data) + last_byte += len(data) + resume_request.close() + + return True + + +def try_download(url, path, max_try=3, sleep_time=5): + count = 0 + while count < max_try: + if download_with_resume(url, path): + return True + count += 1 + sleep(sleep_time) + logging.error('Download of {} failed after {} tries'.format(url, max_try)) + return False + + class BulkDownloader: _EXT = '.mp3' @@ -39,7 +88,7 @@ def list_mp3(self): to_download = self._get_url_to_download_from_html(page) return to_download - def download_mp3(self): + def download_mp3(self, dry_run=False): if not self.folder(): raise BulkDownloaderException('No folder is defined for the download') to_download = self.list_mp3() @@ -48,8 +97,9 @@ def download_mp3(self): name = os.path.basename(file) name = name.replace('%20', ' ') path = os.path.join(self.folder(), name) - logging.info('Saving {} to {}'.format(name, path)) - urllib.request.urlretrieve(file, path) + logging.info('Saving {} to {} from {}'.format(name, path, file)) + if not dry_run: + try_download(file, path) logging.info('Done') def _get_url_to_download_from_html(self, page): @@ -93,6 +143,7 @@ def main(): logging.getLogger().setLevel(logging.INFO) log_format = "[%(levelname)s] %(message)s" logging.basicConfig(format=log_format) + logging.captureWarnings(True) parser = argparse.ArgumentParser(description='Download MP3s from RSS feed or web folder') parser.add_argument('--url', dest='url', help='URL to inspect') parser.add_argument('-f', '--folder', dest='folder', help='Destination folder') diff --git a/src/tests/test_bulk_downloader.py b/src/tests/test_bulk_downloader.py index e4172f5..f485915 100644 --- a/src/tests/test_bulk_downloader.py +++ b/src/tests/test_bulk_downloader.py @@ -1,4 +1,6 @@ import pytest +import os +from shutil import rmtree from .. import bulk_downloader as bd @@ -30,3 +32,39 @@ def test_wrong_server(): with pytest.raises(bd.BulkDownloaderException): bdl.list_mp3() + +def test_dl_no_folder(): + bdl = bd.BulkDownloader('https://feeds.radiokawa.com/podcast_nawak.xml') + assert len(bdl.list_mp3()) > 0 + with pytest.raises(bd.BulkDownloaderException): + bdl.download_mp3() + + +def test_dl_dry(): + bdl = bd.BulkDownloader('https://feeds.radiokawa.com/podcast_nawak.xml', './dl') + assert len(bdl.list_mp3()) > 0 + bdl.download_mp3(dry_run=True) + + +@pytest.fixture(scope='module') +def tmp_directory(request): + tmp_directory = os.path.join(os.getcwd(), 'tmp_dir') + if os.path.exists(tmp_directory): + rmtree(tmp_directory) + os.mkdir(tmp_directory) + + def clean(): + rmtree(tmp_directory) + request.addfinalizer(clean) + return tmp_directory + + +def test_try_download_ok(tmp_directory): + assert bd.try_download('http://www.acute3d.com/embed/Logo-acute3D.png', + os.path.join(tmp_directory, 'acute3d.png'), 2, 1) + + +def test_try_download_ko(tmp_directory): + assert not bd.try_download('http://www.acute3d.com/embed/Logo-pix4d.png', + os.path.join(tmp_directory, 'pix4d.png'), 2, 1) + From 42da38ffedfd08ccf3202c22e11d1e7a29f4e57d Mon Sep 17 00:00:00 2001 From: cnovel Date: Sun, 16 Jun 2019 14:06:55 +0200 Subject: [PATCH 2/2] Fix formatting --- src/bulk_downloader.py | 3 ++- src/tests/test_bulk_downloader.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bulk_downloader.py b/src/bulk_downloader.py index 5405167..b845f79 100644 --- a/src/bulk_downloader.py +++ b/src/bulk_downloader.py @@ -37,7 +37,8 @@ def download_with_resume(url, path): with open(path, 'wb') as f: while last_byte < expected_size: logging.debug("{} vs {}".format(last_byte, expected_size)) - logging.debug("Starting download with already {}% of the file".format((100*last_byte)/expected_size)) + logging.debug("Starting download with already {}% of the file". + format((100*last_byte)/expected_size)) resume_header = {'Range': 'bytes=%d-' % last_byte} resume_request = requests.get(url, headers=resume_header, stream=True, verify=True, allow_redirects=True) diff --git a/src/tests/test_bulk_downloader.py b/src/tests/test_bulk_downloader.py index f485915..99d59f9 100644 --- a/src/tests/test_bulk_downloader.py +++ b/src/tests/test_bulk_downloader.py @@ -67,4 +67,3 @@ def test_try_download_ok(tmp_directory): def test_try_download_ko(tmp_directory): assert not bd.try_download('http://www.acute3d.com/embed/Logo-pix4d.png', os.path.join(tmp_directory, 'pix4d.png'), 2, 1) -