#!/usr/bin/env python2
# general tool to extract from, rewrite or add to text collection files
# that are split into "chunks". This is our primary text storage format
# aka "pubStore format".
# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
from __future__ import print_function
import sys
if sys.version_info[0]==2 and not sys.version_info[1]>=7:
    print("Sorry, this program requires at least python 2.7")
    exit(1)
# load default python packages
import logging, optparse, os, glob, zipfile, types, gzip, shutil
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubChange
import pubGeneric, maxRun, pubStore, pubConf, maxCommon, pubXml, pubPubmed
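# (roughly, as used in this script: pubChange provides the filter/rechunk/addPmids
# operations, pubStore the reader/writer for the chunked text format, pubConf the
# resolution of dataset names to text directories, and pubGeneric/maxCommon small
# helpers for command-line options, logging and directory handling)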
# === CONSTANTS ===================================
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] command options

command is one of filterText, filterPmid, filterIssn, addPmids, rechunk, index, show

command "filterText":
    create a subset of dataset(s) with text that contains one of the given keywords
    %prog filterText <datasetList> <keywordListCommaSep> <datasetOut>
    example:
    %prog filterText pmc,elsevier,crawler ebola,filovirus ebola

command "filterPmid":
    create a subset of dataset(s) restricted to a list of PMIDs and write it to a new dataset
    %prog filterPmid <datasetList> <pmidListFile> <datasetOut>
    example:
    %prog filterPmid pmc,elsevier,crawler uniprotPmids.txt uniProtText

command "filterIssn":
    create a subset of dataset(s) restricted to a list of ISSNs
    %prog filterIssn <datasetList> issn1,issn2,issn3,... <outFname>
    example:
    %prog filterIssn pmc 1367-4803 bioinformatics
    It may be easier to use sqlite3 + filterPmid for ISSN filtering.

command "addPmids":
    read PMIDs from datasetDir/medline.ids.tab, rewrite all articles.gz files
    and add the new PMIDs. medline.ids.tab can be created with pubFingerprint.
    example:
    %prog addPmids elsevier

command "rechunk":
    %prog rechunk <inDir> <outDir>
    read all chunked text tables from inDir and write them to outDir, merging
    smaller chunks into bigger ones.

command "index":
    %prog index <inDir>
    load missing article files into the sqlite index.

command "show":
    %prog show <inDirList> <docSelector>
    pull a single document out of the text directories specified.
    Output goes to stdout, attributes are prefixed with |
    example:
    %prog show pmc 'pmid=1234567'
""")
#parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %default", default=pubConf.identifierStart["medline"])
#parser.add_option("", "--parse", dest="parse", action="store", help="for debugging, just parse one single xml file", default=None)
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
def main(args, options):
    cmd = args[0]
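
    # "filterIssn": scan all articles in the input dataset(s) and copy those whose
    # electronic or print ISSN matches one of the given ISSNs into a new store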
    if cmd=="filterIssn":
        inSpec, issnListStr, outFname = args[1:]
        store = pubStore.PubWriterFile(outFname)
        issns = issnListStr.split(",")
        inDirs = pubConf.resolveTextDirs(inSpec)
        for inDir in inDirs:
            for reader in pubStore.iterPubReaders(inDir):
                for article, files in reader.iterArticlesFileList(None):
                    if article.eIssn in issns or article.printIssn in issns:
                        fileDicts = [f._asdict() for f in files]
                        store.writeDocs(article._asdict(), fileDicts)
        store.close()

    elif cmd=="filterPmid":
        inSpec, pmidFname, outDir = args[1:]
        pmidFname = os.path.abspath(pmidFname)
        assert(isfile(pmidFname))
        pubChange.filterCmd(inSpec, pmidFname, outDir, options)
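
    # "filterText": run the keyword filter into a temporary "parts" directory, then
    # rechunk the parts into the final output directory and remove the intermediates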
    elif cmd=="filterText":
        inSpec, searchSpec, outDir = args[1:]
        partsDir = join(outDir, "parts")
        maxCommon.mustBeEmptyDir(partsDir, makeDir=True)
        outFnames = pubChange.filterCmd(inSpec, searchSpec, partsDir, options)
        pubChange.rechunk(partsDir, outDir)
        # cleanup
        for fname in outFnames:
            os.remove(fname)
        os.removedirs(partsDir)
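
    # "filterJob" is not listed in the usage message; it filters a single chunk
    # (pubChange.filterOneChunk), presumably as one job of the commands above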
    elif cmd=="filterJob":
        inSpec, pmidFname, outSpec = args[1:]
        pubChange.filterOneChunk(inSpec, pmidFname, outSpec)

    elif cmd=="rechunk":
        inDir, outDir = args[1:]
        pubChange.rechunk(inDir, outDir)

    elif cmd=="index":
        inDir = args[1]
        inDir = pubConf.resolveTextDir(inDir)
        pubStore.updateSqlite(inDir)
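
    # "show": look up documents matching the selector expression and print the
    # article record followed by each of its file records to stdout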
    elif cmd=="show":
        inDirStr, whereExpr = args[1:]
        inDirs = inDirStr.split(",")
        for artDict, fileDicts in pubStore.lookupFullDocs(inDirs, whereExpr):
            print(pubStore.dictToMarkLines(artDict))
            print()
            for fileDict in fileDicts:
                print(pubStore.dictToMarkLines(fileDict))

    elif cmd=="addPmids":
        pubChange.addPmids(args[1])

    else:
        raise Exception("Unknown command %s" % cmd)
# ----------- MAIN --------------
if args==[]:
    parser.print_help()
    exit(1)
# normal operation
pubGeneric.setupLogging(progFile, options)
main(args, options)