diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index 2613e0b..ddfc023 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -1,6 +1,9 @@ import asyncio import aiohttp +import requests +import time +from tqdm import tqdm from KEGG_parser.parsers import parse_ko @@ -32,7 +35,8 @@ async def download_coroutine(session, url, attempts=10, wait=30): raise ValueError('Bad HTTP request status %s: %s\n%s' % (response.status, response.reason, url)) await asyncio.sleep(wait) - raise ValueError('KEGG has forbidden request after %s attempts' % attempts) + raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % + (attempts, url, response.status)) async def kegg_download_manager(loop, list_of_ids): @@ -44,10 +48,39 @@ async def kegg_download_manager(loop, list_of_ids): return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]] +def download_synchronous(url, attempts=10): + for _ in range(attempts): + response = requests.get(url) -def get_from_kegg_api(loop, list_of_ids, parser): - return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager(loop, list_of_ids))] + if response.status_code == 200: + return response.text + + # if none of our attempts have returned OK + raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % + (attempts, url, response.status_code)) +def kegg_download_manager_synchronous(list_of_ids, wait=1): + """This is a backup in case the async downloading is forbidden.""" + urls = ['http://rest.kegg.jp/get/%s' % '+'.join(chunk) for chunk in chunks(list(list_of_ids), 10)] + num_urls = len(urls) + print(f"Total urls to download: {num_urls}. Progress will be shown below.") + results = [] + for url in tqdm(urls): + results.append(download_synchronous(url)) + time.sleep(wait) + + return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]] + + + +def get_from_kegg_api(loop, list_of_ids, parser): + try: + return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager(loop, list_of_ids))] + except ValueError: + print("Asynchronous downloading of KEGG records has failed. KEGG parser will try to download data sequentially." + "This will be slower.") + time.sleep(30) + return [parser(raw_record) for raw_record in kegg_download_manager_synchronous(list_of_ids)] def get_kegg_record_dict(list_of_ids, parser, records_file_loc=None, verbose=False): if records_file_loc is None: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ba56dbb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pytest +aiohttp +asyncio +requests +tqdm diff --git a/setup.py b/setup.py index b56fc20..02eb90f 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ name="KEGG-parser", version=__version__, setup_requires=['pytest-runner'], - tests_require=['pytest'], - install_requires=['aiohttp'], + tests_require=['pytest', 'requests'], + install_requires=['aiohttp', 'asyncio', 'tqdm', 'requests'], packages=find_packages(), description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.", long_description=long_description,