#!/usr/bin/env python
# load default python packages
from __future__ import print_function
import sys, logging, optparse, os, glob, zipfile, types, re, tempfile, shutil, codecs
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubGeneric, pubStore, pubConf, maxCommon, pubXml, pubConvElsevier, maxRun
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <in> <out> - convert Elsevier fulltext from CONSYN XML (zipfile) format into pubtools format

If <in> and <out> are directories:
    will create an index of all XML files, write it to outDir/consyn.index
    and split it into chunks outDir/consyn.index.split/<xxxx>.tab
If <in> and <out> are files:
    will parse the input file as an index file and write outDir/<xxxx>.zip

Example:
    pubConvElsevier /hive/data/outside/pubs/elsevier/ /hive/data/inside/pubs/text/elsevier/

The *big* CONSYN zipfiles have to be uploaded by Elsevier to your own FTP server
or are shipped on a hard disk. Updates are downloaded via HTTP (RSS feed).

If a file doi2pmid.tab.gz is found in the input directory, PMIDs are added for
those DOIs where they are available.

Make sure that you run pubGetElsevier whenever a new batch is ready: CONSYN
will DELETE files after a few days. If you missed updates, you need to request
them manually ("updated since=xxx"). More info on consyn.elsevier.com
""")
parser.add_option("", "--chunkSize", dest="chunkSize", action="store", type="int", help="number of articles per chunk, adapt this to your cluster, default %default", default=2000)
parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %s", default=pubConf.identifierStart["elsevier"])
parser.add_option("", "--notCompress", dest="notCompress", action="store_true", help="do not use compression", default=False)
parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
parser.add_option("", "--auto", dest="auto", action="store_true", help="auto mode: read and write from default folders (<pubConf.pubsInDir>/elsevier, <pubConf.pubsTextDir>/elsevier), doesn't require any arguments", default=None)
# ----------- MAIN --------------
def main(args, options):
    pubGeneric.setupLogging(progFile, options)

    # only for debugging: parse a single XML file and dump the result to temp.txt
    if options.parse is not None:
        fname = args[0]
        logging.info("Parsing file %s, writing to temp.txt" % fname)
        ofh = codecs.open("temp.txt", "w", encoding="utf8") # etree can only accept normal strings
        xmlString = open(fname).read()
        xmlTree = pubXml.etreeFromXml(xmlString)
        articleData = pubStore.createEmptyArticleDict()
        articleData = pubConvElsevier.parseElsevier(xmlTree, articleData)
        content = pubConvElsevier.treeToAscii_Elsevier(xmlTree)[0]
        # write the text twice, once through repr(), to make encoding problems visible
        ofh.write("RAW CONTENT with repr()\n")
        ofh.write(repr(pubStore.replaceSpecialChars(content))+"\n")
        ofh.write("RAW CONTENT without repr\n")
        ofh.write(pubStore.replaceSpecialChars(content)+"\n")
        ofh.write("ARTICLE DATA\n")
        for key, val in articleData.items(): # items() instead of iteritems(), for Python 3 compatibility
            ofh.write("%s\t%s\n" % (key, pubStore.replaceSpecialChars(val)))
        ofh.close()
        sys.exit(0)
    if args == [] and not options.auto:
        parser.print_help()
        sys.exit(1)
    # normal operation: index, chunk and convert on the cluster
    inDir, outDir = pubGeneric.setInOutDirs(options.auto, args, "elsevier")
    maxCommon.mustExist(inDir)
    minId = options.minId

    if not os.path.isdir(inDir):
        print("first parameter must be a directory")
        sys.exit(1)

    runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster, outDir=outDir)
    pubConvElsevier.createChunksSubmitJobs(inDir, outDir, minId, runner, options.chunkSize)
    pubStore.updateSqlite(outDir)
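# After a successful run, outDir should contain consyn.index, the chunked
# index under consyn.index.split/, one <xxxx>.zip article archive per chunk,
# and the sqlite lookup table written by pubStore.updateSqlite (layout
# inferred from the usage message and the calls above).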
if __name__ == "__main__":
    pubGeneric.addGeneralOptions(parser)
    (options, args) = parser.parse_args()
    main(args, options)