Skip to content

Commit

Permalink
Merge pull request #15 from lozuponelab/download_error_info
Browse files Browse the repository at this point in the history
Fix issues with forbidden urls with async attempting to download too many sites
  • Loading branch information
sterrettJD authored Jun 9, 2023
2 parents 7155d3a + cf0112d commit 6e02a54
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 5 deletions.
39 changes: 36 additions & 3 deletions KEGG_parser/downloader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import asyncio

import aiohttp
import requests
import time
from tqdm import tqdm

from KEGG_parser.parsers import parse_ko

Expand Down Expand Up @@ -32,7 +35,8 @@ async def download_coroutine(session, url, attempts=10, wait=30):
raise ValueError('Bad HTTP request status %s: %s\n%s' % (response.status, response.reason, url))
await asyncio.sleep(wait)

raise ValueError('KEGG has forbidden request after %s attempts' % attempts)
raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' %
(attempts, url, response.status))


async def kegg_download_manager(loop, list_of_ids):
Expand All @@ -44,10 +48,39 @@ async def kegg_download_manager(loop, list_of_ids):

return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]]

def download_synchronous(url, attempts=10, wait=1):
    """Fetch ``url`` with a blocking request, retrying until it answers HTTP 200.

    Args:
        url: Address passed to ``requests.get``.
        attempts: Maximum number of requests to make.
        wait: Seconds to pause between a failed attempt and the next one.

    Returns:
        The body text of the first response whose status code is 200.

    Raises:
        ValueError: If no attempt returned status 200. This includes the case
            where ``attempts`` is less than 1 and no request is made at all;
            the reported status is then ``None``.
    """
    # Tracked separately from ``response`` so the error below can always be
    # built, even when the loop body never runs.
    status_code = None
    for attempt in range(attempts):
        if attempt:
            # Pause before retrying rather than immediately re-sending a
            # request that the server has just rejected.
            time.sleep(wait)

        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        status_code = response.status_code

    # if none of our attempts have returned OK
    raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' %
                     (attempts, url, status_code))

def kegg_download_manager_synchronous(list_of_ids, wait=1):
    """Download KEGG records sequentially, one request after another.

    This is the fallback used when the asynchronous download is forbidden.
    The ids are passed through ``chunks(..., 10)`` and each resulting group is
    joined with ``+`` into a single ``rest.kegg.jp/get`` url.

    Args:
        list_of_ids: Iterable of KEGG ids to download.
        wait: Seconds to sleep after each request.

    Returns:
        A flat list of raw record strings. Each response is split on ``///``
        and the piece after the final separator is discarded.
    """
    id_batches = chunks(list(list_of_ids), 10)
    urls = ['http://rest.kegg.jp/get/%s' % '+'.join(batch) for batch in id_batches]
    num_urls = len(urls)
    print(f"Total urls to download: {num_urls}. Progress will be shown below.")

    raw_records = []
    for url in tqdm(urls):
        response_text = download_synchronous(url)
        time.sleep(wait)
        # Everything after the last '///' separator is not a record.
        raw_records.extend(response_text.split('///')[:-1])

    return raw_records



def get_from_kegg_api(loop, list_of_ids, parser, fallback_wait=30):
    """Download KEGG records and parse each one with ``parser``.

    The asynchronous download is tried first. If it raises ``ValueError``,
    the function waits and then downloads the same ids sequentially.

    Args:
        loop: Event loop used to run ``kegg_download_manager`` to completion.
        list_of_ids: KEGG ids to download.
        parser: Callable applied to each raw record string.
        fallback_wait: Seconds to pause before starting the sequential
            download after the asynchronous one has failed.

    Returns:
        A list with one parsed record per downloaded raw record.

    Raises:
        ValueError: If the sequential fallback download also fails, or if
            ``parser`` itself raises it.
    """
    # Only the download is guarded: a ValueError raised while parsing must not
    # be mistaken for a download failure and trigger a pointless re-download.
    try:
        raw_records = loop.run_until_complete(kegg_download_manager(loop, list_of_ids))
    except ValueError:
        print("Asynchronous downloading of KEGG records has failed. KEGG parser will try to download data "
              "sequentially. This will be slower.")
        time.sleep(fallback_wait)
        raw_records = kegg_download_manager_synchronous(list_of_ids)

    return [parser(raw_record) for raw_record in raw_records]

def get_kegg_record_dict(list_of_ids, parser, records_file_loc=None, verbose=False):
if records_file_loc is None:
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pytest
aiohttp
asyncio
requests
tqdm
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
name="KEGG-parser",
version=__version__,
setup_requires=['pytest-runner'],
tests_require=['pytest'],
install_requires=['aiohttp'],
tests_require=['pytest', 'requests'],
install_requires=['aiohttp', 'asyncio', 'tqdm', 'requests'],
packages=find_packages(),
description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.",
long_description=long_description,
Expand Down

0 comments on commit 6e02a54

Please sign in to comment.