From ac7616a463583be389faba873e38991aca825ff7 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Thu, 8 Jun 2023 13:22:27 -0600 Subject: [PATCH 01/11] add url to error message so user can check this url --- KEGG_parser/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index 2613e0b..40b8248 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -32,7 +32,7 @@ async def download_coroutine(session, url, attempts=10, wait=30): raise ValueError('Bad HTTP request status %s: %s\n%s' % (response.status, response.reason, url)) await asyncio.sleep(wait) - raise ValueError('KEGG has forbidden request after %s attempts' % attempts) + raise ValueError('KEGG has forbidden request after %s attempts for url %s' % (attempts, url)) async def kegg_download_manager(loop, list_of_ids): From 69e743c8260cfd5e0a2b64ce0fbc6c982a3dc330 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Thu, 8 Jun 2023 13:37:13 -0600 Subject: [PATCH 02/11] add response code to error message from downloader --- KEGG_parser/downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index 40b8248..c6dc060 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -32,7 +32,8 @@ async def download_coroutine(session, url, attempts=10, wait=30): raise ValueError('Bad HTTP request status %s: %s\n%s' % (response.status, response.reason, url)) await asyncio.sleep(wait) - raise ValueError('KEGG has forbidden request after %s attempts for url %s' % (attempts, url)) + raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % + (attempts, url, response.status)) async def kegg_download_manager(loop, list_of_ids): From cfb526694b41034778b86662c02b7e9f6ea0b727 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Thu, 8 Jun 2023 14:59:24 -0600 Subject: [PATCH 03/11] tentative synchronous download function added --- KEGG_parser/downloader.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index c6dc060..2c69fb6 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -1,6 +1,8 @@ import asyncio import aiohttp +import requests +import time from KEGG_parser.parsers import parse_ko @@ -45,10 +47,41 @@ async def kegg_download_manager(loop, list_of_ids): return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]] +def download_synchronous(url, attempts=10): + for _ in range(attempts): + response = requests.get(url) -def get_from_kegg_api(loop, list_of_ids, parser): - return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager(loop, list_of_ids))] + if response.status_code == 200: + return response.text + + # if none of our attempts have returned OK + raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % + (attempts, url, response.status_code)) + +def kegg_download_manager_synchronous(list_of_ids, wait=3): + """This is a backup in case the async downloading is forbidden.""" + urls = ['http://rest.kegg.jp/get/%s' % '+'.join(chunk) for chunk in chunks(list(list_of_ids), 10)] + num_urls = len(urls) + + results = [] + for i, url in enumerate(urls): + if i % 10 == 0: + print(f"Downloaded {(i/num_urls)*100}% of requested KOs.") + results.append(download_synchronous(url)) + time.sleep(wait) + return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]] + + + +def get_from_kegg_api(loop, list_of_ids, parser): + try: + return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager(loop, list_of_ids))] + except ValueError: + print("Asynchronous downloading of KEGG records has failed. KEGG parser will try to download data sequentially." + "This will be slower.") + time.sleep(30) + return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager_synchronous(list_of_ids))] def get_kegg_record_dict(list_of_ids, parser, records_file_loc=None, verbose=False): if records_file_loc is None: From d3a4f829ea43b72a5689c9d32a57310b19e5ce02 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Thu, 8 Jun 2023 15:56:54 -0600 Subject: [PATCH 04/11] update waits and tqdm for progress --- KEGG_parser/downloader.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index 2c69fb6..2e0a71c 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -3,6 +3,7 @@ import aiohttp import requests import time +from tqdm import tqdm from KEGG_parser.parsers import parse_ko @@ -58,15 +59,13 @@ def download_synchronous(url, attempts=10): raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % (attempts, url, response.status_code)) -def kegg_download_manager_synchronous(list_of_ids, wait=3): +def kegg_download_manager_synchronous(list_of_ids, wait=1): """This is a backup in case the async downloading is forbidden.""" urls = ['http://rest.kegg.jp/get/%s' % '+'.join(chunk) for chunk in chunks(list(list_of_ids), 10)] num_urls = len(urls) - + print(f"Total urls to download: {num_urls}. Progress will be shown below.") results = [] - for i, url in enumerate(urls): - if i % 10 == 0: - print(f"Downloaded {(i/num_urls)*100}% of requested KOs.") + for url in tqdm(urls): results.append(download_synchronous(url)) time.sleep(wait) From 5f619dfc153c7a414d2d4b455f70e8ba421c213d Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Thu, 8 Jun 2023 15:57:18 -0600 Subject: [PATCH 05/11] update requirements to install tqdm too --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b56fc20..d5ad1ca 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ version=__version__, setup_requires=['pytest-runner'], tests_require=['pytest'], - install_requires=['aiohttp'], + install_requires=['aiohttp', 'tqdm'], packages=find_packages(), description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.", long_description=long_description, From 6505209070efcca70634c0108230201fc23b7738 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 08:53:26 -0600 Subject: [PATCH 06/11] add requests to install requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d5ad1ca..82c7217 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ version=__version__, setup_requires=['pytest-runner'], tests_require=['pytest'], - install_requires=['aiohttp', 'tqdm'], + install_requires=['aiohttp', 'tqdm', 'requests'], packages=find_packages(), description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.", long_description=long_description, From 7c639d4b1b74f6592e91bab75e116092967697bc Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 09:01:24 -0600 Subject: [PATCH 07/11] add other requirements --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 82c7217..6dabfd8 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ name="KEGG-parser", version=__version__, setup_requires=['pytest-runner'], - tests_require=['pytest'], - install_requires=['aiohttp', 'tqdm', 'requests'], + tests_require=['pytest', 'requests'], + install_requires=['aiohttp', 'asyncio', 'tqdm', 'time', 'requests'], packages=find_packages(), description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.", long_description=long_description, From eb4c0a8d7d6abdba3d52739257cfcebe4fd86df9 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 09:02:07 -0600 Subject: [PATCH 08/11] create requirements.txt file --- requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2993334 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +pytest +aiohttp +asyncio +requests +tqdm +time From 06580aeee1db7061c808b9c563a4fd0342314390 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 09:03:32 -0600 Subject: [PATCH 09/11] remove time --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2993334..ba56dbb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,3 @@ aiohttp asyncio requests tqdm -time From c700cc961f82c8f47c2b06725f575f3aadd95796 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 11:28:48 -0600 Subject: [PATCH 10/11] remove time from setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6dabfd8..02eb90f 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ version=__version__, setup_requires=['pytest-runner'], tests_require=['pytest', 'requests'], - install_requires=['aiohttp', 'asyncio', 'tqdm', 'time', 'requests'], + install_requires=['aiohttp', 'asyncio', 'tqdm', 'requests'], packages=find_packages(), description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.", long_description=long_description, From cf0112d61c10a1e96978c2f5bf07d37bb5b54592 Mon Sep 17 00:00:00 2001 From: John Sterrett Date: Fri, 9 Jun 2023 12:48:08 -0600 Subject: [PATCH 11/11] fix looping with sync download --- KEGG_parser/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py index 2e0a71c..ddfc023 100644 --- a/KEGG_parser/downloader.py +++ b/KEGG_parser/downloader.py @@ -80,7 +80,7 @@ def get_from_kegg_api(loop, list_of_ids, parser): print("Asynchronous downloading of KEGG records has failed. KEGG parser will try to download data sequentially." "This will be slower.") time.sleep(30) - return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager_synchronous(list_of_ids))] + return [parser(raw_record) for raw_record in kegg_download_manager_synchronous(list_of_ids)] def get_kegg_record_dict(list_of_ids, parser, records_file_loc=None, verbose=False): if records_file_loc is None: