-
Notifications
You must be signed in to change notification settings - Fork 0
/
PubMedCentralAgent.py
executable file
·156 lines (117 loc) · 6.04 KB
/
PubMedCentralAgent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Purpose: provide an interface to various services at PubMed Central
# Usage:
# 1. Initialize the module by calling setToolName() and/or setEmailAddress() as desired to override
# default settings.
# 2. Instantiate an IDConverterAgent (to convert DOI IDs to PMC IDs) or a PDFLookupAgent (to take
# PMC IDs and look up their download URLs)
# 3. Run with it.
import urllib.request, urllib.error, urllib.parse
import xml.dom.minidom
import HttpRequestGovernor
###--- Globals ---###

# Tool name included in ID-converter requests (NCBI uses it for tracking).
# Can be overridden at runtime with setToolName().
TOOL_NAME = "PubMedCentralAgent"

# Contact email address included in ID-converter requests (NCBI uses it for tracking).
# Can be overridden at runtime with setEmailAddress().
# NOTE(review): this default looks like a redaction placeholder rather than a real
# address -- confirm the intended value.
EMAIL_ADDRESS = "[email protected]"

# Template for the PubMed Central ID-converter service (DOI IDs -> PMC IDs).  The three
# %s placeholders are, in order: tool name, email address, comma-delimited list of DOI IDs.
# The response is requested in CSV format.
ID_CONVERTER_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=%s&email=%s&ids=%s&format=csv"

# Template for the PubMed Central service that reports the download URLs for one article.
# The single %s placeholder is one PMC ID.
PDF_LOOKUP_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=%s"
###--- Functions ---###
def setToolName(tool):
    # Purpose: override the tool name that is submitted to NCBI (for their tracking purposes)
    # Returns: None
    # Effects: rebinds the module-level TOOL_NAME to 'tool'
    global TOOL_NAME
    TOOL_NAME = tool
def setEmailAddress(email):
    # Purpose: override the email address that is submitted to NCBI (for their tracking purposes)
    # Returns: None
    # Effects: rebinds the module-level EMAIL_ADDRESS to 'email'
    global EMAIL_ADDRESS
    EMAIL_ADDRESS = email
def _splitList (
items, # the list of items to split
n # the maximum number of items per sublist
):
# Purpose: (private) splits 'items' in a list of sub-lists, each of which has 'n' or fewer items in it
# Returns: list of lists as described in Purpose
# Example:
# _splitList ( [ 'a', 'b', 'c', 'd', 'e' ], 2) ===> [ ['a', 'b'], ['c', 'd'], ['e'] ]
if len (items) <= n:
return [ items ]
else:
return [ items [:n] ] + _splitList (items [n:], n)
###--- Classes ---###
class IDConverterAgent:
    # Is: an agent that communicates with PubMed Central to convert DOI IDs to PMC IDs

    def __init__ (self):
        return

    def getPMCID (self, doiID):
        # Purpose: look up the PMC ID for a single DOI ID
        # Returns: str (PMC ID) or None (if the DOI ID has no PMC ID)
        # Throws: Exception if there are problems communicating with PubMed Central

        # getPMCIDs() keys its result by the whitespace-stripped ID, so look up the stripped
        # form; .get() yields None if the ID is absent from the mapping.
        return self.getPMCIDs([ doiID ]).get(doiID.strip())

    def getPMCIDs (self, doiIDs):
        # Purpose: look up the PMC ID corresponding to each DOI ID in the input list
        # Returns: dictionary mapping from each DOI ID (stripped of leading & trailing spaces)
        #   to its corresponding PMC ID (or None, if a given DOI ID has no PMC ID).  DOI values
        #   appearing in the response are used as keys exactly as returned.
        # Throws: Exception if the response lacks a "DOI" or "PMCID" column; whatever
        #   HttpRequestGovernor.readURL raises is passed through unchanged.
        # Notes: IDs are sent in batches of at most 20 per request.
        # NOTE(review): IDs are interpolated into the URL as-is; whether readURL
        #   percent-encodes them is not visible here -- confirm for DOIs containing
        #   characters such as '&' or '#'.

        import csv      # function-scope import: only this method needs the CSV parser

        pmcIDs = {}     # maps from DOI ID to PMC ID

        if not doiIDs:
            return pmcIDs

        # strip leading & trailing spaces from IDs
        cleanIDs = [ x.strip() for x in doiIDs ]

        # Every requested ID gets an entry, so IDs missing from the response map to None
        # (as promised in Returns) instead of being absent.
        for doiID in cleanIDs:
            pmcIDs[doiID] = None

        for sublist in _splitList(cleanIDs, 20):
            # NOTE(review): relies on readURL returning the response body as text (str) -- confirm
            text = HttpRequestGovernor.readURL(ID_CONVERTER_URL % (TOOL_NAME, EMAIL_ADDRESS, ','.join(sublist)))

            # The response is CSV with double-quoted string values.  csv.reader removes the
            # quoting, keeps commas that occur inside quoted values, and splitlines() copes
            # with both LF and CRLF line endings.
            rows = list(csv.reader(text.splitlines()))

            # first row will have column headers.  We need DOI and PMCID columns.
            header = rows[0] if rows else []

            if 'DOI' not in header:
                raise Exception('Cannot find "DOI" column in getPMCIDs')
            if 'PMCID' not in header:
                raise Exception('Cannot find "PMCID" column in getPMCIDs')

            doiCol = header.index('DOI')
            pmcCol = header.index('PMCID')
            minColumns = max(doiCol, pmcCol) + 1

            # now go through the rest of the rows and do the mapping
            for row in rows[1:]:
                if len(row) < minColumns:
                    continue    # blank or truncated row; lacks one of the needed columns

                # an empty PMCID value means the DOI ID has no PMC ID
                pmcIDs[row[doiCol]] = row[pmcCol] or None

        return pmcIDs
class PDFLookupAgent:
    # Is: an agent that communicates with PubMed Central to look up the download URL for
    #   each of a set of PMC IDs

    def __init__ (self):
        return

    def getUrl (self, pmcID):
        # Purpose: look up the download URL for a single PMC ID
        # Returns: str (URL) or None (if the PMC ID has no file to download)
        # Throws: Exception if there are problems communicating with PubMed Central

        # getUrls() keys its result by the whitespace-stripped ID, so look up the stripped form.
        return self.getUrls([ pmcID ]).get(pmcID.strip())

    def getUrls (self, pmcIDs):
        # Purpose: look up the download URL corresponding to each PMC ID in the input list
        # Returns: dictionary mapping from each PMC ID (stripped of leading & trailing spaces)
        #   to its corresponding download URL (or None, if a given PMC ID has no download URL)
        # Throws: whatever HttpRequestGovernor.readURL raises is passed through unchanged, as
        #   is the parser's exception if a response is not well-formed XML.
        # Notes: Direct links to PDF files are preferred, but if a given ID doesn't have one, we
        #   will fall back on a link to a tarred, gzipped directory, where available.
        #   One request is made per PMC ID.

        urls = {}       # maps from PMC ID to download URL

        if not pmcIDs:
            return urls

        for pmcID in [ x.strip() for x in pmcIDs ]:
            response = HttpRequestGovernor.readURL(PDF_LOOKUP_URL % pmcID)
            xmldoc = xml.dom.minidom.parseString(response)

            try:
                links = {}      # maps from format to url for this pmcID

                for linkElement in xmldoc.getElementsByTagName("link"):
                    # getAttribute() returns '' for a missing attribute; a link without
                    # both a format and an href is unusable, so skip it.
                    linkFormat = linkElement.getAttribute('format')
                    href = linkElement.getAttribute('href')
                    if linkFormat and href:
                        links[linkFormat] = href
            finally:
                xmldoc.unlink()     # release the DOM tree promptly

            # prefer direct PDF over tarred, gzipped directory; None if neither is offered
            urls[pmcID] = links.get('pdf', links.get('tgz'))

        return urls