-
Notifications
You must be signed in to change notification settings - Fork 0
/
PdfParser.py
executable file
·404 lines (336 loc) · 16.9 KB
/
PdfParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
# Purpose: provides functions for extracting text from PDF files
# and for finding DOI IDs in extracted text
# Notes:
# 1. relies on MGI's litparser product to do the actual PDF-to-text conversion
# 2. must be initialized with call to setLitParserDir()
import os
import re
import subprocess
###--- Globals ---###
# Module-level configuration: None until setLitParserDir() assigns it.
# PdfParser._loadFullText() raises an Exception while this is unset (falsy).
LITPARSER = None # full path to parsing script in litparser product
###--- Functions ---###
def setLitParserDir (
    directory       # string; path to the litparser product
    ):
    # Purpose: initialize this module by identifying where to find the
    #   litparser product.
    # Throws: Exception if 'directory' does not exist or if it does not
    #   contain the expected pdfGetFullText.sh script.
    # Notes: the global LITPARSER is only assigned after both checks pass,
    #   so a failed call never leaves the module pointing at a script that
    #   does not exist (which would defeat the "not initialized" check
    #   made before parsing).
    global LITPARSER

    if not os.path.isdir(directory):
        raise Exception('%s is not a directory' % directory)

    script = os.path.join(directory, 'pdfGetFullText.sh')
    if not os.path.exists(script):
        raise Exception('%s does not exist' % script)

    LITPARSER = script
    return
###--- Classes ---###
class DoiFinder (object):
    # Is: a parser that knows how to find the DOI ID in the extracted text
    #   of a PDF
    # Has: journal specific reg ex's and logic
    # Does: return the DOI ID in a text string

    # Regexes are defined as class variables so they are only compiled once,
    # and each is kept near the code that uses it.  All patterns are raw
    # strings so that regex escapes such as '\.' are never interpreted as
    # (invalid) Python string escapes.

    # regex for detecting basic DOI IDs:
    #   "10", followed by '.', followed by any number of 0-9 or '.',
    #   followed by '/', followed by anything else until we reach a space,
    #   tab, or semicolon.  Note that a newline does NOT end the match,
    #   which is why getDoiID() has so much newline handling.
    DOI_RE = re.compile(r'(10\.[0-9\.]+/[^ \t;]+)')

    # regex specifically for recognizing IDs from any 10.1177 journal that
    # contains trailing 'Journal'
    # 10.1177 is Sage publisher: https://us.sagepub.com/en-us/nam/sage-journals
    SAGE_DOI_RE = re.compile(r'(10\.1177/[a-zA-Z0-9\-\.]+)Journal')

    def getDoiID (self, text):
        # Purpose: return the DOI ID from the text, where text is the
        #   extracted text from a PDF.
        # Returns: string DOI ID or None (if no ID can be found)
        # Notes: the journal-specific helpers (_getPnasID, _getBloodID,
        #   _getJciInsightID, _getReproductionID) return None when their
        #   own regex finds nothing; in that case we keep the ID found by
        #   the generic logic rather than failing.

        if 'www.pnas.org' in text:
            pnasID = self._getPnasID(text)
            if pnasID is not None:
                return pnasID
            # The text mentions the PNAS site but has no PNAS-style ID
            # (e.g., a citation in another journal's paper), so fall
            # through to the generic logic.

        # Everything except pnas
        text = text.replace(' journal.pone', 'journal.pone')
        match = self.DOI_RE.search(text)
        if not match:           # no apparent DOI
            return None

        # Got an ID match, let's see if it needs any special handling
        doiID = match.group(1)

        # Special case for PLoS journals, which often have a line break in
        # the ID.
        # PLOS journals have 28-character DOI IDs 99.98% of the time.
        # Out of 10,000+ PLOS DOI IDs in MGI so far, the only others are
        # single IDs with 21 and 24 characters.
        # So if we encounter a newline within the first 21 characters,
        # we can just remove it.
        # Also as of the new pdftotext util we started using in Oct 2019,
        # the 1st or 2nd ID occurrence in the paper may be truncated when a
        # space is inserted instead of a line break.
        # So try looking at a couple more ID instances.
        # NOTE(review): the later candidates are whatever DOI_RE matches
        # next; they are not checked to be PLoS IDs.
        if doiID.startswith('10.1371/'):
            doiID = self._dropEarlyNewline(doiID, 21)
            tries = 0
            while len(doiID) < 28 and tries < 3:    # quit after 3 tries
                tries += 1
                match = self.DOI_RE.search(text, match.end())
                if not match:   # odd, this shouldn't happen, bail
                    break
                doiID = self._dropEarlyNewline(match.group(1), 21)

        # Special case for Journals from American Society for Microbiology
        # (ASM).  Includes Molecular and Cellular Biology (also J Virol,
        # MBio (mBio?), Infec Immun).
        # These have DOI IDs from 20 to 32 characters -- but which are often
        # interrupted by line breaks in their new (circa late-2016) PDF
        # format.  As a workaround for the most common case, remove any
        # newlines within the first 20 chars of the ID.
        if doiID.startswith('10.1128/'):
            nl = doiID.find('\n')
            while 0 <= nl < 20:
                doiID = doiID.replace('\n', '', 1)
                nl = doiID.find('\n')

        # This code is not journal specific.
        slash = doiID.find('/')     # where is the 1st '/'
        nl = doiID.find('\n')       # where is the 1st '\n'

        # if there is a newline right after the slash, just remove it
        if (nl >= 0) and (nl == (slash + 1)):
            doiID = doiID.replace('\n', '', 1)
            nl = doiID.find('\n')

        # if there is a newline later in the string,
        # trim the ID at that point
        if (nl >= 0) and (nl > slash):
            doiID = doiID[:nl]

        doiID = self._cleanEnd(doiID)   # rm trailing ')', ']', '.', whitespace

        # Now back to journal specific code...

        if self.SAGE_DOI_RE.match(doiID):
            # a '10.1177/...Journal' DOI ID (Sage journals):
            # remove the 'Journal' text
            doiID = doiID.replace('Journal', '')

        elif (doiID.find('/eLife') > 0) and doiID.endswith('.001'):
            doiID = doiID[:-4]  # eLife IDs often errantly end with .001

        elif doiID.startswith('10.1182/blood'):
            # e.g. '10.1182/bloodadvances...' does not fit the Blood
            # pattern; keep the generic ID in that case
            doiID = self._getBloodID(text) or doiID

        elif doiID.startswith('10.1172/jci'):
            doiID = self._getJciInsightID(text) or doiID

        elif doiID.startswith('10.1530/REP'):
            doiID = self._getReproductionID(text) or doiID

        # if this is a Science DOI ID, we instead need to find and return
        # the last DOI ID for the PDF file.
        # scitranslmed is from the same publisher (like scisignal) but is
        # not handled here.
        # I haven't found any examples in our db from scitranslmed or
        # scisignal where the 1st doi is the wrong one (haven't looked too
        # hard either)
        elif doiID.startswith(('10.1126/science', '10.1126/scisignal')):
            doiID = self._getScienceID(text)    # may be None

        return doiID
    # end getDoiID() --------------

    def _dropEarlyNewline (self, doiID, limit):
        # Purpose: (private) if the first newline in doiID occurs before
        #   index 'limit', remove that one newline
        # Returns: the (possibly) updated string
        nl = doiID.find('\n')
        if 0 <= nl < limit:
            doiID = doiID.replace('\n', '', 1)
        return doiID

    END_CLEAN_RE = re.compile(r'[\)\.\]\s]+$')

    def _cleanEnd (self, text):
        # strip off trailing parentheses, periods,
        # brackets, and whitespace from the text
        return self.END_CLEAN_RE.sub('', text)

    # regex specifically for recognizing IDs from Blood journal
    BLOOD_DOI_RE = re.compile(r'10\.1182/blood([0-9\-\.\s]+)')

    # (6/25/2020) Note Blood has at least two types of articles, full
    # articles, e.g., MGI:6284584, and "comment" (or short?) articles like
    # MGI:6284578.
    # In the comment articles, the PDF for an article often contains the
    # tail end of a previous article in the issue, and may contain the
    # beginning of the next article.  These tail/beginning parts of the
    # surrounding articles may contain their own DOI IDs.
    # So it is easy to get the wrong (but valid!) DOI ID.
    # In our download files, often there is a download page at the end that
    # contains the correct DOI ID.  Probably should change the logic to get
    # that ID from the PDF (should consider what happens w/ supp data,
    # haven't looked for examples of that)
    # Blood also has two types of IDs:
    #   hyphenated:   '10.1182/blood-2018-12-889758'
    #   unhyphenated: '10.1182/blood.2019004603' (seems to always start
    #                 w/ '.')
    # The code below handles both forms (see _BloodFixHyphens).
    # Given all the pain here, best solution is to have Quosa name the
    # downloaded PDFs using the PMID_##### convention so we don't have to
    # try to find the DOI ID in the text.  See TR12755.

    def _getBloodID (self, text):
        # Purpose: (private) find the first Blood DOI ID in 'text' and
        #   normalize its punctuation; the hyphenation sometimes needs
        #   tweaking and the ID may contain a '.' or a ' '
        # Returns: string DOI ID, or None if 'text' contains nothing that
        #   matches BLOOD_DOI_RE
        # Note: Blood really needs better logic; this takes the first
        #   matching ID, which (per the notes above) can belong to a
        #   neighboring article.  Should probably grab the last doiID like
        #   Science.
        match = self.BLOOD_DOI_RE.search(text)
        if not match:
            return None

        doiID = self._cleanEnd(match.group(0))
        numbers = self._cleanEnd(match.group(1))
        revised = self._BloodFixHyphens(numbers)
        doiID = doiID.replace(numbers, revised)
        return doiID.replace(' ', '').replace('\n', '')
    # end _getBloodID() --------------

    def _BloodFixHyphens (self, s):
        # Purpose: fix the hyphenation in Blood DOI IDs, which should be
        #   of the format "-yyyy-mm-others" where the first four digits
        #   are the year, the next two are the month, and then all the
        #   others come at the end.  If 's' contains a '.', the
        #   unhyphenated ".yyyymmothers" form is produced instead.
        # Returns: string updated according to 'Purpose', or the input
        #   string if there are fewer than 7 digits
        # Notes: all whitespace (not only spaces) is discarded before
        #   slicing, so a line break in the middle of the year or month
        #   cannot shift the digits into the wrong field.
        digits = re.sub(r'[-.\s]', '', s)
        if len(digits) < 7:
            return s

        year, month, rest = digits[:4], digits[4:6], digits[6:]
        if '.' in s:
            return '.%s%s%s' % (year, month, rest)
        return '-%s-%s-%s' % (year, month, rest)
    # end _BloodFixHyphens() --------------

    # Reproduction: regex specifically for recognizing IDs from any
    # 10.1530/REP
    REP_DOI_RE = re.compile(r'(?:doi.org/)?(10\.1530/REP[ \-0-9]+)')

    def _getReproductionID (self, text):
        # Purpose: (private) find the first Reproduction DOI ID in 'text'.
        #   Reproduction may have spaces introduced, and newer papers
        #   have 'doi.org/' in front of the ID.
        # Returns: string DOI ID with spaces removed, or None if 'text'
        #   contains nothing that matches REP_DOI_RE
        match = self.REP_DOI_RE.search(text)
        if not match:
            return None
        return match.group(1).replace(' ', '')
    # end _getReproductionID() --------------

    # regex specifically for recognizing IDs from any 10.1172/jci. insight
    # may have line break (which may get translated to ' ' by pdftotext)
    # after 'jci.'
    JCI_DOI_RE = re.compile(r'(10\.1172/jci\.[\s]?insight\.[0-9]+)')

    def _getJciInsightID (self, text):
        # Purpose: (private) find the first JCI Insight DOI ID in 'text'
        # Returns: string DOI ID with spaces and newlines removed, or None
        #   if 'text' contains nothing that matches JCI_DOI_RE
        match = self.JCI_DOI_RE.search(text)
        if not match:
            return None
        return match.group(0).replace(' ', '').replace('\n', '')
    # end _getJciInsightID() --------------

    # regex for recognizing IDs from Proc Natl Acad Sci (PNAS) journal
    # examples: matches
    #   10.1073/pnas.0931458100 OR 10.1073pnas.0931458100 OR
    #   10.1073#pnas.0931458100
    # \W? matches 0 or 1 non-alphanumeric between '10.1073' and 'pnas'
    PNAS_DOI_RE = re.compile(r'(10\.1073\W?pnas\.[0-9]+)')

    def _getPnasID (self, text):
        # Purpose: (private) find the first PNAS DOI ID in 'text'.
        #   PNAS DOIs sometimes have a missing '/', e.g.,
        #   '10.1073pnas.041475098', so they can't be found using our
        #   standard DOI_RE.  A missing '/' is inserted, and a SINGLE
        #   intervening non-alphanumeric char is replaced by '/'.
        # Returns: string DOI ID, or None if 'text' contains nothing that
        #   matches PNAS_DOI_RE
        match = self.PNAS_DOI_RE.search(text)
        if not match:
            return None

        doiID = match.group(1)
        if '/' not in doiID:
            pnasAt = doiID.find('pnas')
            if pnasAt == 7:         # there is no '/', add one
                doiID = doiID[:7] + '/' + doiID[7:]
            elif pnasAt == 8:       # there is a single intervening char
                # jak: this is really rare
                # Replace by position: the intervening char may be one
                # that also occurs elsewhere in the ID (e.g., '.').
                doiID = doiID[:7] + '/' + doiID[8:]
        return doiID
    # end _getPnasID() --------------

    # regex specifically for recognizing IDs from Science journals
    SCIENCE_DOI_RE = re.compile(r'(10\.1126/[a-zA-Z0-9\-\.]+)')

    # regex for finding "accepted" string
    ACCEPTED_RE = re.compile('accepted', re.IGNORECASE)

    def _getScienceID (self, text):
        # Purpose: (private) find the DOI ID for a Science journal paper
        # Returns: string DOI ID, or None if no Science ID is found close
        #   enough after an occurrence of "accepted"
        # Notes: Science journals include the end of the prior article at
        #   the start of the PDF file.  This means that we would usually
        #   return an inaccurate DOI ID for PDFs from Science journals.
        #   Instead, the desired ID occurs at the end of the article,
        #   shortly after the word "accepted".  Use these criteria to
        #   get the desired ID and return it.

        # Find all occurrences of the word 'accepted' and note the
        # position of each.  (It is possible that 'accepted' would
        # occur in the start of the next article, so we can't just
        # blindly take the last one.)
        acceptedPositions = [ m.start()
                              for m in self.ACCEPTED_RE.finditer(text) ]

        # how close is close enough? (number of characters)
        threshold = 80

        # Start at the last occurrence of "accepted" and see if we can
        # find a Science DOI ID reasonably soon after it.  If so, that's
        # our desired ID to return.  If not, work back through the other
        # instances of "accepted".
        for accPos in reversed(acceptedPositions):
            match = self.SCIENCE_DOI_RE.search(text, accPos)
            if match and (match.start() <= (accPos + threshold)):
                return match.group(1)
        return None
    # end _getScienceID() --------------
# end class DoiFinder -------------------
class PdfParser:
    # Is: a parser that knows how to extract text from a PDF file
    # Has: path to a PDF file, text from a PDF file
    # Does: reads a PDF file from the file system, parses it, provides
    #   access to full text and various bits of information

    doiFinder = DoiFinder()     # only need a singleton DoiFinder,
                                # shared by all PdfParser instances

    def __init__ (self,
        pdfPath         # string; path to PDF file to parse
        ):
        # Purpose: constructor
        # Throws: Exception if the file specified in 'pdfPath' does
        #   not exist

        if not os.path.exists(pdfPath):
            raise Exception('PDF file does not exist: %s' % pdfPath)

        self.pdfPath = pdfPath  # string; path to the PDF file
        self.fullText = None    # string; text from the PDF file
        self.loaded = False     # boolean; did we read the file yet?
        self.stderr = ''        # string; stderr from the parsing script
                                # (defined here so getStderr() is safe to
                                # call before the file has been parsed)
        return

    def _loadFullText (self):
        # Purpose: (private) get the text from the PDF file
        # Throws: Exception if this library has not been properly
        #   initialized or if there are errors in parsing the file
        # Notes: only loads the file once; if we already read it,
        #   calling this function is a no-op.

        if self.loaded:
            return

        if not LITPARSER:
            raise Exception('Must initialize pdfParser library using setLitParserDir()')

        self.stderr = ''
        cmd = [ LITPARSER, self.pdfPath ]
        cmdText = ' '.join(cmd)

        try:
            completedProcess = subprocess.run(cmd, text=True,
                                                capture_output=True)
        except Exception as err:
            # error in attempting to execute parsing script.  Only
            # Exception subclasses are caught, so KeyboardInterrupt and
            # SystemExit propagate untouched; the original error is kept
            # as the cause for debugging.
            raise Exception('Failed to execute: %s' % cmdText) from err

        self.stderr = completedProcess.stderr

        # parsing script finished with an error code?
        if completedProcess.returncode != 0:
            msg = 'Failed to parse %s\n' % self.pdfPath
            msg += 'Stderr from %s:\n%s\n' % (cmdText, self.stderr)
            raise Exception(msg)

        # parsing was successful, so grab the text and note that we
        # loaded the file
        self.fullText = completedProcess.stdout
        self.loaded = True
        return

    def getStderr(self):
        # Purpose: return whatever the parsing script wrote to stderr
        # Returns: string; empty if the script has not been run yet
        return self.stderr

    def getFirstDoiID (self):
        # Purpose: return the first DOI ID from the PDF file
        # Returns: string DOI ID or None (if no ID can be found, or if
        #   no text was extracted)
        # Throws: Exception if this library has not been properly
        #   initialized or if there are errors in parsing the file
        # Note: this would be more aptly named getDoiID()

        self._loadFullText()
        if self.fullText:
            return self.doiFinder.getDoiID(self.fullText)
        return None

    def getText (self):
        # Purpose: return the full text extracted from the PDF file
        # Returns: string (full text), or None if no text was extracted
        # Throws: Exception if this library has not been properly
        #   initialized or if there are errors in parsing the file

        self._loadFullText()
        if self.fullText:
            return self.fullText
        return None
# end class PdfParser -------------------