#!/usr/bin/env python
# load default python packages
from __future__ import print_function
import sys, logging, optparse, os, glob, zipfile, types, re, tempfile, shutil, codecs
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubGeneric, pubStore, pubConf, maxCommon, pubXml, pubConvElsevier, maxRun
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <in> <out> - convert Elsevier fulltext from CONSYN XML (zipfile) format into pubtools format

If <in> and <out> are directories:
    will create an index of all XML files, write it to outDir/consyn.index
    and split it into chunks outDir/consyn.index.split/<xxxx>.tab
If <in> and <out> are files:
    will parse the input file as an index file and write outDir/<xxxx>.zip

Example:
    pubConvElsevier /hive/data/outside/pubs/elsevier/ /hive/data/inside/pubs/text/elsevier/

The *big* CONSYN zipfiles have to be uploaded by Elsevier to your own FTP server
or are shipped on a hard disk. Updates are downloaded via HTTP (RSS feed).

If a file doi2pmid.tab.gz is found in the input directory, PMIDs are added for
those DOIs where they are available.

Make sure that you run pubGetElsevier whenever a new batch is ready: CONSYN
will DELETE files after a few days. If you missed updates, you need to request
them manually ("updated since=xxx"). More info on consyn.elsevier.com
""")
parser.add_option("", "--chunkSize", dest="chunkSize", action="store", type="int", help="number of articles per chunk, adapt this to your cluster, default %default", default=2000)
parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %s", default=pubConf.identifierStart["elsevier"])
parser.add_option("", "--notCompress", dest="notCompress", action="store_true", help="do not use compression", default=False)
parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
parser.add_option("", "--auto", dest="auto", action="store_true", help="auto mode: read and write from default folders (<pubConf.pubsInDir>/elsevier, <pubConf.pubsTextDir>/elsevier), doesn't require any arguments", default=None)
# ----------- MAIN --------------
def main(args, options):
    pubGeneric.setupLogging(progFile, options)

    # only for debugging: parse a single XML file and dump the result to temp.txt
    if options.parse is not None:
        fname = args[0]
        logging.info("Parsing file %s, writing to temp.txt" % fname)
        ofh = codecs.open("temp.txt", "w", encoding="utf8") # etree can only accept normal strings
        xmlString = open(fname).read()
        xmlTree = pubXml.etreeFromXml(xmlString)
        articleData = pubStore.createEmptyArticleDict()
        articleData = pubConvElsevier.parseElsevier(xmlTree, articleData)
        content = pubConvElsevier.treeToAscii_Elsevier(xmlTree)[0]
        # write the text twice, once through repr(), to make encoding problems visible
        ofh.write("RAW CONTENT with repr()\n")
        ofh.write(repr(pubStore.replaceSpecialChars(content))+"\n")
        ofh.write("RAW CONTENT without repr\n")
        ofh.write(pubStore.replaceSpecialChars(content)+"\n")
        ofh.write("ARTICLE DATA\n")
        for key, val in articleData.items(): # items() instead of iteritems(), for Python 3 compatibility
            ofh.write("%s\t%s\n" % (key, pubStore.replaceSpecialChars(val)))
        ofh.close()
        sys.exit(0)
    if args == [] and not options.auto:
        parser.print_help()
        sys.exit(1)
    # normal operation: index, chunk and convert on the cluster
    inDir, outDir = pubGeneric.setInOutDirs(options.auto, args, "elsevier")
    maxCommon.mustExist(inDir)
    minId = options.minId

    if not os.path.isdir(inDir):
        print("first parameter must be a directory")
        sys.exit(1)

    runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster, outDir=outDir)
    pubConvElsevier.createChunksSubmitJobs(inDir, outDir, minId, runner, options.chunkSize)
    pubStore.updateSqlite(outDir)
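# After a successful run, outDir should contain consyn.index, the chunked
# index under consyn.index.split/, one <xxxx>.zip article archive per chunk,
# and the sqlite lookup table written by pubStore.updateSqlite (layout
# inferred from the usage message and the calls above).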
if __name__ == "__main__":
    pubGeneric.addGeneralOptions(parser)
    (options, args) = parser.parse_args()
    main(args, options)