Merge pull request #15 from lozuponelab/download_error_info

Fix forbidden-URL errors caused by the async downloader attempting to download too many URLs at once
lozuponelab · Jun 9, 2023 · 6e02a54 · 6e02a54
2 parents 7155d3a + cf0112d
commit 6e02a54
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 5 deletions.
diff --git a/KEGG_parser/downloader.py b/KEGG_parser/downloader.py
@@ -1,6 +1,9 @@
 import asyncio
 
 import aiohttp
+import requests
+import time
+from tqdm import tqdm
 
 from KEGG_parser.parsers import parse_ko
 
@@ -32,7 +35,8 @@ async def download_coroutine(session, url, attempts=10, wait=30):
                 raise ValueError('Bad HTTP request status %s: %s\n%s' % (response.status, response.reason, url))
         await asyncio.sleep(wait)
 
-    raise ValueError('KEGG has forbidden request after %s attempts' % attempts)
+    raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' % 
+                     (attempts, url, response.status))
 
 
 async def kegg_download_manager(loop, list_of_ids):
@@ -44,10 +48,39 @@ async def kegg_download_manager(loop, list_of_ids):
 
     return [raw_record for raw_records in results for raw_record in raw_records.split('///')[:-1]]
 
+def download_synchronous(url, attempts=10):
+    """Fetch url with requests, retrying until a 200 response is received.
+
+    Up to `attempts` GET requests are made. The text of the first response
+    whose status code is 200 is returned.
+
+    Raises:
+        ValueError: if no attempt returned status 200 (including the case
+            where `attempts` is less than 1 and no request is made).
+    """
+    # Tracked separately from `response` so the error below is still
+    # well-formed when the loop body never runs.
+    last_status = None
+    for _ in range(attempts):
+        response = requests.get(url)
+        last_status = response.status_code
 
-def get_from_kegg_api(loop, list_of_ids, parser):
-    return [parser(raw_record) for raw_record in loop.run_until_complete(kegg_download_manager(loop, list_of_ids))]
+        if response.status_code == 200:
+            return response.text
+
+    # if none of our attempts have returned OK
+    raise ValueError('KEGG has forbidden request after %s attempts for url %s , which returns a response status of %s' %
+                     (attempts, url, last_status))
 
+def kegg_download_manager_synchronous(list_of_ids, wait=1):
+    """Sequential fallback for when the async downloading is forbidden.
+
+    The IDs are grouped ten per request URL and the URLs are fetched one
+    after another, sleeping `wait` seconds after every request. Each response
+    is split on '///' and the final piece of each split is dropped; the
+    remaining pieces are returned together as one flat list.
+    """
+    id_batches = chunks(list(list_of_ids), 10)
+    urls = ['http://rest.kegg.jp/get/%s' % '+'.join(batch) for batch in id_batches]
+    num_urls = len(urls)
+    print(f"Total urls to download: {num_urls}. Progress will be shown below.")
+
+    raw_records = []
+    for url in tqdm(urls):
+        response_text = download_synchronous(url)
+        # Keep everything except the piece after the final '///' separator.
+        raw_records.extend(response_text.split('///')[:-1])
+        time.sleep(wait)
+
+    return raw_records
+
+
+
+def get_from_kegg_api(loop, list_of_ids, parser):
+    """Download the KEGG records for list_of_ids and parse each with parser.
+
+    The asynchronous download manager is run on `loop` first. If it raises
+    ValueError, a notice is printed, the function sleeps for 30 seconds and
+    the records are downloaded again with the sequential download manager.
+
+    Returns a list with one parsed record per raw record downloaded.
+    """
+    try:
+        # Only the download is guarded: a ValueError raised by `parser` must
+        # not be mistaken for a download failure and trigger a re-download.
+        raw_records = loop.run_until_complete(kegg_download_manager(loop, list_of_ids))
+    except ValueError:
+        print("Asynchronous downloading of KEGG records has failed. KEGG parser will try to download data sequentially. "
+              "This will be slower.")
+        # Pause before switching to the sequential downloader.
+        time.sleep(30)
+        raw_records = kegg_download_manager_synchronous(list_of_ids)
+    return [parser(raw_record) for raw_record in raw_records]
 
 def get_kegg_record_dict(list_of_ids, parser, records_file_loc=None, verbose=False):
     if records_file_loc is None:

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,5 @@
+pytest 
+aiohttp 
+asyncio
+requests
+tqdm
diff --git a/setup.py b/setup.py
@@ -12,8 +12,8 @@
       name="KEGG-parser",
       version=__version__,
       setup_requires=['pytest-runner'],
-      tests_require=['pytest'],
-      install_requires=['aiohttp'],
+      tests_require=['pytest', 'requests'],
+      install_requires=['aiohttp', 'asyncio', 'tqdm', 'requests'],
       packages=find_packages(),
       description="KEGG Parser: A tool for parsing and converting KEGG data into manipulable Python objects.",
       long_description=long_description,