Skip to content

Commit

Permalink
Add a timeout to the requests.get call again
Browse files (browse the repository at this point in the history)
  • Loading branch information
malteweber committed Jun 2, 2024
1 parent f026b6c commit 08709ce
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions scripts/download_pubmed_fulltexts.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -77,7 +77,8 @@ def extract_text_from_pdf(pdf):

API_URL = "https://api.openalex.org/works?per-page=200&select=ids,locations&filter=has_pmid:true,locations.is_oa:true&cursor="
FILEPATH = "data/pdf_texts.jsonl.gz"
- TIMEOUT = 60
+ TIMEOUT_PAPER = 60
+ TIMEOUT_OPENALEX = 120
LIMIT = 5
downloaded_paper = 0

Expand All @@ -92,13 +93,13 @@ def extract_text_from_pdf(pdf):
break

print(cursor)
- response = requests.get(API_URL+cursor)
+ response = requests.get(API_URL+cursor, timeout=TIMEOUT_OPENALEX)

pdf_urls_list = [
(paper.get("ids").get("pmid"), get_pdf_urls_for_paper(paper)) for paper in response.json().get("results")
]

- pdfs = asyncio.run(_a_download_pdfs_for_urls_list(pdf_urls_list, ssl_ctx, timeout=TIMEOUT))
+ pdfs = asyncio.run(_a_download_pdfs_for_urls_list(pdf_urls_list, ssl_ctx, timeout=TIMEOUT_PAPER))

pdf_texts = [(pm_id, extract_text_from_pdf(pdf)) for (pm_id, pdf) in pdfs if pdf is not None]

Expand Down

0 comments on commit 08709ce

Please sign in to comment.