-
Notifications
You must be signed in to change notification settings - Fork 21
/
pubPrepCrawl
executable file
·283 lines (230 loc) · 11.1 KB
/
pubPrepCrawl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env python
# load default python packages
import logging, optparse, os, sys, collections, gzip, re, codecs, operator, glob, random
from os.path import *
from collections import defaultdict
# add <scriptDir>/lib/ to package search path
sys.path.insert(0, join(dirname(abspath(__file__)),"lib"))
# load our own libraries
import pubConf, pubGeneric, tabfile, maxCommon, pubPubmed, pubResolvePublishers, pubCrawlConf, unidecode
import pubUpdatePmids
from urllib2 import urlparse
import xml.etree.cElementTree as etree
# file name of the ISSN table inside a publisher directory; it is written by
# writeIssnTables and read back by getPmidsForIssns
ISSNTAB = "issns.tab"
# ===== FUNCTIONS =======
def parseIssns(pubFname, crawlPubDirs, crawlIssnOverwrite):
    """ parse the publisher table into a dict publisher name -> list of (issn, title)

    pubFname is read with maxCommon.iterTsvRows. Every row has to provide the
    fields pubName, journalIssns and titles; the last two are "|"-separated
    lists that are paired up position by position.

    crawlPubDirs and crawlIssnOverwrite are not used by this function. They
    stay in the signature so that existing callers keep working.

    Returns a defaultdict(list), so the lookup of a publisher that is not in
    the table gives an empty list.
    """
    logging.info("Parsing publisher table %s" % pubFname)

    pubToJournals = defaultdict(list)
    for pub in maxCommon.iterTsvRows(pubFname):
        issns = pub.journalIssns.split("|")
        titles = pub.titles.split("|")

        if len(titles) != len(issns):
            # show the field that was actually split above (journalIssns),
            # so the message matches the data that caused the mismatch
            logging.warning("Illegal fields for publisher %s, %s, %s" % (pub.pubName, pub.titles, pub.journalIssns))

        # zip() stops at the shorter list: a malformed row still contributes
        # its leading (issn, title) pairs
        pubToJournals[pub.pubName].extend(zip(issns, titles))

    return pubToJournals
#logging.info("Parsing %s" % pubFname)
#pubs = maxCommon.iterTsvRows(pubFname)
## parse into dict pub -> list of issns
#pubToIssn = {}
#issnToTitle = {}
#for pub in pubs:
# issns = pub.journalEIssns.split("|")
# titles = pub.titles.split("|")
# pubToIssn[pub.pubName] = (issns, titles)
# if len(titles)!=len(issns):
# logging.warn("Illegal fields for publisher %s, %s, %s" % (pub.pubName, pub.titles, pub.journalEIssns))
# continue
# assert(len(titles)==len(issns))
# for title, issn in zip(titles, issns):
# issnToTitle[issn] = title
## make sure that all configured publishers are in the dict we just read
#issnToPub = {}
#for pubName in crawlPubDirs:
# if pubName not in pubToIssn:
# raise Exception("publisher %s defined in pubConf not found in %s" % (pubName, pubFname))
# issns, titles = pubToIssn[pubName]
# for issn, title in zip(issns, titles):
# issnToPub[issn] = pubName
## write into dict issn -> pub
#for corrIssn in crawlIssnOverwrite:
# if corrIssn in issnToPub:
# del issnToPub[corrIssn]
## now reverse again and make dict pub -> set of (issn, title)
#pubToIssnName = {}
#for issn, pub in issnToPub.iteritems():
# #if issn=="1536-4844":
# #print repr(issn), pub, issnToTitle.get(issn, "")
# #assert(False)
# pubToIssnName.setdefault(pub, set()).add((issn, issnToTitle.get(issn, "")))
#return pubToIssnName
def writeIssnTables(outDir, pubIds, pubToIssn, issnOverwrite):
    """ write the ISSN table (file name ISSNTAB) into one publisher directory

    The last path component of outDir is used as the publisher ID. outDir is
    created if it does not exist yet.

    outDir        - publisher directory, its basename has to be a publisher ID
    pubIds        - dict publisher description -> publisher ID
    pubToIssn     - dict publisher description -> list of (issn, title)
    issnOverwrite - dict publisher ID -> list of (issn, startYear, endYear),
                    these manual entries are appended after the default ones

    The table has the columns issn, journal, startYear, endYear, publisher.
    Default entries get 0 for both years. Entries with an empty ISSN are
    skipped.

    Raises KeyError if no description in pubIds maps to the publisher ID.
    """
    # invert the configuration: publisher ID -> all descriptions that map to it
    pubIdToDescs = {}
    for pubDesc, confPubId in pubIds.iteritems():
        pubIdToDescs.setdefault(confPubId, []).append(pubDesc)

    # outDir has to be identical to the pubId from pubConf
    pubId = basename(outDir)
    issnOutFname = join(outDir, ISSNTAB)
    if not isdir(outDir):
        logging.info("Creating dir %s" % outDir)
        os.makedirs(outDir)

    logging.info("Writing to %s" % issnOutFname)
    issnCount = 0
    # the with-block makes sure the table is flushed and closed, also on errors
    with open(issnOutFname, "w") as outFh:
        outFh.write("issn\tjournal\tstartYear\tendYear\tpublisher\n")

        # default ISSNs, taken from the publisher table
        for pubDesc in pubIdToDescs[pubId]:
            journals = pubToIssn[pubDesc]
            logging.debug("Found %d issns for publisher %s" % (len(journals), pubId))
            for issn, title in journals:
                logging.info("Title: %s, print ISSN %s" % (title, issn))
                if issn == "":
                    continue
                line = "%s\t%s\t0\t0\t%s\n" % (issn, title, pubDesc)
                # some journal names contain unicode, we just strip it here
                outFh.write(unidecode.unidecode(line))
                issnCount += 1

        # manual ISSNs: there is no journal title for them, so that column is
        # left empty to keep the five columns of the header; the publisher
        # column gets the publisher ID
        for issn, startYear, endYear in issnOverwrite.get(pubId, []):
            outFh.write("%s\t\t%d\t%d\t%s\n" % (issn, startYear, endYear, pubId))
            issnCount += 1

    logging.info("Wrote %d ISSNs" % issnCount)
def writeIssns(pubFname, outDir, pubDirs, issnOverwrite):
    """ parse the publisher table pubFname and write the ISSNs of the
    publisher that corresponds to outDir into the ISSN table in outDir.

    pubDirs and issnOverwrite are passed through unchanged to parseIssns
    and writeIssnTables.
    """
    journalsByPublisher = parseIssns(pubFname, pubDirs, issnOverwrite)
    writeIssnTables(
        outDir,
        pubDirs,
        journalsByPublisher,
        issnOverwrite)
def writePmids(pubDir, pmids):
    """ randomize pmids and write them as pmids.txt to pubDir, one per line

    pubDir - existing directory that receives the file pmids.txt
    pmids  - list of PMIDs as strings; the list is shuffled in place, so the
             caller sees the new order, too
    """
    random.shuffle(pmids)
    pmidFname = join(pubDir, "pmids.txt")
    logging.info("Writing %d PMIDs to %s" % (len(pmids), pmidFname))
    # the with-block closes the file even if a write fails half-way
    with open(pmidFname, "w") as pmidFh:
        pmidFh.writelines(pmid + "\n" for pmid in pmids)
def resolveIssnToPmid_Pubmed(issnRowFnames, minYear):
    """ query Pubmed eutils for the PMIDs of every ISSN record and write
    them, one pmids file per publisher directory, via writePmids.

    issnRowFnames is a dict pubDir -> list of ISSN records. Of every record,
    the attributes issn, startYear and endYear are used. A year of 0 stands
    for "not set": the start year is then replaced by minYear and the end
    year by 2030.
    """
    for publisherDir, issnRecords in issnRowFnames.iteritems():
        logging.info("Processing publisher dir %s" % publisherDir)

        collected = []
        for rec in issnRecords:
            logging.info("Retrieving PMIDs for ISSN %s" % rec.issn)

            # build the search term: journal restriction plus date range
            journalTerm = rec.issn + "[ta]"
            firstYear = int(rec.startYear) or str(minYear)
            lastYear = int(rec.endYear) or "2030"
            query = journalTerm + " %s:%s[dp]" % (firstYear, lastYear)

            logging.debug("sending query to pubmed: %s" % query)
            found = list(pubPubmed.ncbiESearch(
                query,
                tool="pubtools_pubPrepCrawl",
                email=pubConf.email,
                delaySecs=pubConf.eutilsDelaySecs))

            if found:
                logging.info("Got %d PMIDs for ISSN %s" % (len(found), rec.issn))
            else:
                logging.warn("No PMIDs for pubmed query %s" % query)
            collected.extend(found)

        writePmids(publisherDir, collected)
def getPmidsForIssns(outDir, minYear):
    """ read the ISSN table (file name ISSNTAB) in outDir and pass its rows,
    keyed by directory, to resolveIssnToPmid_Pubmed, together with minYear.

    If outDir is not a directory or has no ISSN table, this is only logged
    and the resolver gets an empty dict.
    """
    # dirPath -> list of ISSN rows; a directory gets a key only if its table
    # has at least one row
    issnsByDir = {}

    for dirPath in (outDir,):
        if isdir(dirPath):
            tabPath = join(dirPath, ISSNTAB)
            if isfile(tabPath):
                logging.info("Reading %s" % tabPath)
                rows = list(maxCommon.iterTsvRows(tabPath))
                if rows:
                    issnsByDir.setdefault(dirPath, []).extend(rows)
            else:
                logging.warn("Could not find %s" % tabPath)
        else:
            logging.info("ignoring file %s (not a directory)" % dirPath)

    resolveIssnToPmid_Pubmed(issnsByDir, minYear)
def main(args, options):
    """ run the command given as args[0] (case-insensitive)

    args    - positional command line arguments: the command, optionally
              followed by the output directory
    options - parsed command line options; minYear and nlmCatalog are read
              here, the object is also passed to pubGeneric.setupLoggingOptions

    Commands:
    publishers - pubResolvePublishers.initJournalDir
    issns      - writeIssns, needs the output directory
    pmids      - getPmidsForIssns, needs the output directory
    allpmids   - pubUpdatePmids.updatePmids, needs the output directory

    Raises Exception for an unknown command or if a command that needs the
    output directory is called without one.
    """
    pubGeneric.setupLoggingOptions(options)

    command = args[0].lower()
    # not every command takes an output directory, so it may be missing
    outDir = args[1] if len(args) > 1 else None

    minYear = options.minYear
    pubDirs = pubCrawlConf.crawlPubIds
    issnOverwrite = pubConf.crawlIssnOverwrite
    pubFname = pubConf.publisherIssnTable
    journalFname = pubConf.journalTable

    # fail with a readable message instead of an unbound-variable error
    if outDir is None and command in ("issns", "pmids", "allpmids"):
        raise Exception("The command %s requires an <outDir> argument" % command)

    if command == "publishers":
        journalListDir = pubConf.journalListDir
        pubResolvePublishers.initJournalDir(journalListDir, options.nlmCatalog, journalFname, pubFname)
    elif command == "issns":
        writeIssns(pubFname, outDir, pubDirs, issnOverwrite)
    elif command == "pmids":
        getPmidsForIssns(outDir, minYear)
    elif command == "allpmids":
        medlineDir = pubConf.resolveTextDir("medline")
        pubUpdatePmids.updatePmids(medlineDir, outDir, None, minYear=minYear)
    else:
        raise Exception("Unknown command %s" % command)
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# The usage text below is printed by optparse; "%prog" is replaced by the
# name of the script.
parser = optparse.OptionParser("""usage: %prog [issns|pmids] <outDir> - prepare the directories for the crawler filling it with ISSNs and PMIDs.
outDir has to correspond to a "publisher name". Publishers are defined in
pubCrawlConf.py and need three configuration items for the crawler:
- a publisher name in the NLM Catalog / Pubmed, for %prog. This will be used by %prog to
find the PMIDs.
- a list of rules, to specify how to crawl the files
- a delay time, between any two http requests
The journal lists for this program are stored in data/publishers/journals.tab.
If your publisher of interest is not already recognized in the "publisher" column in
the file journals.tab there, then the easiest way forward is probably to not
use the "issns" argument, but copy an existing issns.tab file and fill it with a handful of
ISSNs manually.
To get an idea of the problem, try this search:
$ egrep '(^source)|(EMBO)' data/journals/journals.tab
You can see that EMBO journals sometimes show Wiley as the publisher, sometimes EMBO,
and at other times are assigned to NPG. Neither NLM, nor Wiley nor Scopus really
show us the copyright holder.
commands:
%prog issns <pubName> - create a file pubName/issns.tab and fill it with all ISSNs of the
publisher named <pubName> from the file data/journals/publishers.tab
%prog pmids <pubName> - retrieve the PMIDs from Pubmed Eutils for all ISSNs in pubName/issns.tab
%prog allPmids <pubName> - retrieve the PMIDs from a local medline copy for all ISSNs
in all direct *subdirectories* of baseDir. In this way, you can update
all PMIDs for all publishers that you crawl with a single command.
usually not needed (the dir data/journals/ contains a pre-made list of publishers and their journals):
%prog publishers - re-create the list of publishers and their journals
""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("", "--nlmCatalog", dest="nlmCatalog", action="store", help="use a given nlmCatalog.xml instead of the default one. Used for the 'publishers' argument.")
parser.add_option("-m", "--minYear", dest="minYear", action="store", type="int", help="minimum year for articles, default is %default", default=1990)
#parser.add_option("-o", "--onlyDir", dest="onlyDir", action="store", help="apply the PMID step only on a given directory")
(options, args) = parser.parse_args()

# without a command there is nothing to do: show the help and signal an error
if len(args) < 1:
    parser.print_help()
    # sys.exit instead of the exit() helper, which only exists if the site
    # module has been loaded
    sys.exit(1)

main(args, options)