-
Notifications
You must be signed in to change notification settings - Fork 21
/
pubConvSpringer
executable file
·85 lines (67 loc) · 3.47 KB
/
pubConvSpringer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# load default python packages
from __future__ import print_function
import sys, logging, optparse, os, glob, zipfile, types, re, tempfile, shutil, codecs
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubGeneric, pubStore, pubConf, maxCommon, pubXml, pubConvSpringer, maxRun
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# Help text printed by -h/--help and by parser.print_help(); optparse expands
# %prog to the name of the running script.
usageText = """usage: %prog [options] <in> <out> - convert fulltext from Springer directories to pubtools format.
If in and out are directories:
Will create an index of all xml files and write them to
outDir/springer.index
Will split these to outDir/springer.index.split/<xxxx>.tab
If in and out are files:
Will parse the input file as an index file
Writes to outDir/<xxxx>.zip
Example:
pubConvSpringer /hive/data/outside/pubs/springer/ /hive/data/inside/pubs/text/springer/
Big zips are shipped on harddisk, updates transferred by ftp.
Email [email protected] for details.
XML documentation: see http://www.springeropen.com/get-published/indexing-archiving-and-access-to-data/xml-dtd
"""
parser = optparse.OptionParser(usageText)

# All options are long-only; none of them has a one-letter form.
parser.add_option(
    "--chunkSize",
    action="store",
    type="int",
    dest="chunkSize",
    default=700,
    help="number of articles per chunk, adapt this to your cluster, default %default",
)
# The --minId option is currently disabled:
#parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %s", default=pubConf.identifierStart["springer"])
parser.add_option(
    "--parse",
    action="store_true",
    dest="parse",
    default=None,
    help="for debugging, just parse one single xml file",
)
parser.add_option(
    "--auto",
    action="store_true",
    dest="auto",
    help="read from default input text dir, write to default output text dir",
)
parser.add_option(
    "--finish",
    action="store_true",
    dest="finish",
    help="finish up after a cluster batch failed and was fixed",
)
# Register the options provided by pubGeneric, then parse the command line.
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()

# ----------- MAIN --------------
pubGeneric.setupLogging(progFile, options)

# only for debugging: parse one xml file, dump its fields to temp.txt, stop
if options.parse is not None:
    if not args:
        # previously this crashed with an IndexError on args[0]
        parser.error("--parse requires the name of an xml file")
    fname = args[0]
    logging.info("Parsing file %s, writing to temp.txt", fname)
    # temp.txt is opened before the input is read, as before, so it is
    # truncated even when parsing fails; "with" guarantees both handles are
    # closed again.
    with codecs.open("temp.txt", "w", encoding="utf8") as ofh:
        # original note: etree can only accept normal strings
        # NOTE(review): presumably why the input is not opened via codecs - confirm
        with open(fname) as ifh:
            xmlString = ifh.read()
        xmlTree = pubXml.etreeFromXml(xmlString)
        articleData = pubStore.createEmptyArticleDict()
        articleData = pubConvSpringer.parseXml(xmlTree, articleData)
        ofh.write("ARTICLE DATA"+"\n")
        # items() exists on Python 2 and 3; iteritems() is Python 2 only
        for key, val in articleData.items():
            ofh.write("%s\t%s\n" % (key, pubStore.replaceSpecialChars(val)))
    sys.exit(0)

# without --auto, the input and output locations must be given
if not args and not options.auto:
    parser.print_help()
    # sys.exit, not the bare exit() that only exists when site is loaded
    sys.exit(1)

# normal operation
if options.auto:
    inDir, outDir = pubConf.defaultInOutDirs("springer")
else:
    if len(args) != 2:
        # previously a wrong argument count died with a ValueError traceback
        parser.error("expected exactly two arguments: <in> <out>")
    inDir, outDir = args

if options.finish:
    pubConvSpringer.finishUp(outDir)
    # NOTE(review): execution continues with the normal conversion below
    # after finishing up; confirm this fall-through is intended.

maxCommon.mustExist(inDir)

#minId = options.minId
# minId is not used further down in this script; the lookup is kept so that
# behavior is unchanged.
minId = pubConf.identifierStart["springer"]

if not os.path.isdir(inDir):
    print("first parameter must be a directory")
    sys.exit(1)

# create the cluster job runner and hand the conversion over to the library
runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster, outDir=outDir)
pubConvSpringer.createChunksSubmitJobs(inDir, outDir, runner, options.chunkSize)