pubBlast

#!/usr/bin/env python
# load default python packages
import sys, logging, optparse, os, glob, shutil, subprocess, collections, codecs, textwrap
from os.path import *

# Make the "lib" directory that sits next to this script importable, ahead of
# every other entry on the module search path.
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.split(progFile)[0]
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path[:0] = [pubToolsLibDir]

# now load our own libraries
import pubConf, pubGeneric, util, maxCommon, maxRun, xml, html, pubStore, maxTables, maxbio
from Bio.Blast import NCBIXML

# Column names of one BLAST HSP record. BlastMatch tuples are written out
# column by column, so this order is also the column order of the .tab files.
BlastMatchFields = [
    "length", "identities", "positives", "eVal", "score",
    "tFile", "tId", "tLen",
    "qFile", "qId", "qLen",
    "tAln", "match", "qAln",
]
BlastMatch = collections.namedtuple("blastMatch", BlastMatchFields)

# A BLAST match followed by the meta data of the article the target came from.
AnnotatedMatchFields = BlastMatchFields + [
    "articleId", "authors", "journal", "year",
    "title", "abstract", "fulltextUrl",
]
AnnotMatch = collections.namedtuple("blastMatchAnnot", AnnotatedMatchFields)

# ==== FUNCTIONS =====

def parseBlastXmlFile(qFile):
    """ parse the BLAST XML file qFile and yield one BlastMatch per HSP

    qFile is the name of a BLAST XML output file. The file name itself is
    stored in the qFile field of every yielded match; the target database
    name reported in the XML goes into tFile.
    """
    # the with-block closes the XML file once the generator is exhausted or closed
    with open(qFile) as xmlFh:
        for rec in NCBIXML.parse(xmlFh):
            qId = rec.query
            qLen = rec.query_length
            tFile = rec.database
            for alignment in rec.alignments:
                tLen = alignment.length
                tId = alignment.hit_def

                for hsp in alignment.hsps:
                    # keyword arguments keep every value in the field it is
                    # named after: the e-value (hsp.expect) goes to eVal and
                    # the raw score (hsp.score) goes to score
                    yield BlastMatch(
                        length=hsp.align_length,
                        identities=hsp.identities,
                        positives=hsp.positives,
                        eVal=hsp.expect,
                        score=hsp.score,
                        tFile=tFile, tId=tId, tLen=tLen,
                        qFile=qFile, qId=qId, qLen=qLen,
                        tAln=hsp.sbjct, match=hsp.match, qAln=hsp.query)

def pubBlatToFasta(inFnames, outDir):
    """ convert the annotation tables inFnames to fasta files in outDir

    For every input file, a file <outDir>/<name>.fa is written, where <name>
    is the input base name up to its first dot. Each row with a non-empty
    seq field becomes one fasta record named after the row's annotId.
    Missing output directories are created. Always returns True.
    """
    # NOTE(review): input files from different directories that share a base
    # name map onto the same .fa file, the later one overwrites the earlier
    # one -- confirm that base names are unique across the inputs.
    pm = maxCommon.ProgressMeter(len(inFnames))
    for inFname in inFnames:
        faFname = join(outDir, basename(inFname).split(".")[0]+".fa")
        faDir = dirname(faFname)
        if not isdir(faDir):
            logging.info("Creating dir %s" % faDir)
            os.makedirs(faDir)

        logging.debug("Converting %s to %s" % (inFname, faFname))
        # the with-block closes the fasta file even if reading the input fails
        with open(faFname, "w") as outFh:
            for row in maxCommon.iterTsvRows(inFname):
                if row.seq=="":
                    continue
                outFh.write(">%s\n" % row.annotId)
                outFh.write(row.seq+"\n")
        pm.taskCompleted()
    return True
    
def pubToolsToFasta(textDir, inMask, seqType, outDir):
    """ copy the files <textDir>/<inMask>.<seqType>.fa to outDir

    inMask is a glob pattern for the first part of the file name. outDir is
    checked/created with maxCommon.mustBeEmptyDir. Always returns True, also
    when no file matched.
    """
    maxCommon.mustBeEmptyDir(outDir, makeDir=True)
    inFnames = glob.glob(join(textDir, "%s.%s.fa" % (inMask, seqType)))
    # report the empty case only when the glob really matched nothing
    if not inFnames:
        logging.info("No files found of type %s" % seqType)
    logging.info("Copying .fa files from %s to %s" % (textDir, outDir))
    for inFn in inFnames:
        outFn = join(outDir, basename(inFn))
        shutil.copyfile(inFn, outFn)
    return True

def indexForBlast(inDir, isProt):
    """ run formatdb on every .fa file directly in inDir

    isProt selects the value of the formatdb -p flag: "T" if true, "F"
    otherwise. The program exits with status 1 if formatdb cannot be started
    or returns a non-zero exit code.
    """
    faPaths = glob.glob(join(inDir, "*.fa"))
    meter = maxCommon.ProgressMeter(len(faPaths))
    logging.info("Indexing all fa files in %s" % (inDir))

    protFlag = "T" if isProt else "F"
    formatDbPath = join(pubConf.blastBinDir, "formatdb")

    for faPath in faPaths:
        logging.debug("Indexing %s" % (faPath))
        cmd = [formatDbPath, "-i", faPath, "-p", protFlag]
        try:
            exitCode = subprocess.call(cmd)
        except OSError:
            # the binary itself could not be executed
            logging.error("Could not call %s" % formatDbPath)
            sys.exit(1)
        if exitCode != 0:
            logging.error("Error %d calling %s" % (exitCode, cmd))
            sys.exit(1)
        meter.taskCompleted()


def indexAnnots(dbDir, datasets, isProt=False):
    """ write the sequences of all datasets as fasta below dbDir and index them

    Output goes to <dbDir>/prot/<dataset> if isProt is true, otherwise to
    <dbDir>/dna/<dataset>. For each dataset, annotation tables below
    pubConf.pubMapBaseDir are converted first; if there are none, the .fa
    files from pubConf.textBaseDir are copied instead. Datasets found in
    neither place are skipped with a warning.
    """
    maxCommon.mustExistDir(dbDir, makeDir=True)

    seqType = "prot" if isProt else "dna"
    dbDir = join(dbDir, seqType)
    maxCommon.mustExistDir(dbDir, makeDir=True)
    logging.info("Importing files into %s, seqtype protein %s" % (dbDir, isProt))

    for dataset in datasets:
        outDir = join(dbDir, dataset)
        if isdir(outDir):
            logging.warn("%s already exists, will overwrite it" % outDir)

        annotMask = join(pubConf.pubMapBaseDir, dataset, "batches", "*", \
            "annots", seqType, "*.tab.gz")
        annotFnames = glob.glob(annotMask)

        if annotFnames:
            # annotation tables exist: convert them to fasta
            logging.info("Found %s, %d files" % (annotMask, len(annotFnames)))
            pubBlatToFasta(annotFnames, outDir)
        else:
            # fall back to the fasta files of the text directory
            logging.warn("Not found %s" % annotMask)
            textDir = join(pubConf.textBaseDir, dataset)
            logging.info("Not found %s, trying %s" % (annotMask, textDir))
            if not isdir(textDir):
                logging.warn("Could not find %s" % textDir)
                continue
            pubToolsToFasta(textDir, "*", seqType, outDir)

        indexForBlast(outDir, isProt)
            
def submitBlastJobs(runner, prog, dbDir, inFaDir, outDir, eVal, dbSize, dataFilter):
    """ submit one blastall job per (input fasta file, target database) pair

    runner     -- object with a submit(cmd) method, receives each command list
    prog       -- blastn, blastp, tblastn or blastx
    dbDir      -- base directory with the "dna" and "prot" blast databases
    inFaDir    -- a single fasta file or a directory with *.fa files
    outDir     -- results are written to XML files below <outDir>/<prog>
    eVal       -- value for the blastall -e option
    dbSize     -- value for the blastall -z option
    dataFilter -- if set, only target entries matching it are used

    Raises ValueError for an unknown prog (before anything is created on
    disk) and Exception if inFaDir is neither a file nor a directory.
    """
    # blast program -> (database subdirectory, extension of the index files)
    progToDb = {
        "blastn"  : ("dna", ".nin"),  # nucl against nucl
        "blastp"  : ("prot", ".pin"), # prot against prot
        "tblastn" : ("dna", ".nin"),  # peptides against trans nucl
        "blastx"  : ("prot", ".pin"), # nucl against prot
    }
    if prog not in progToDb:
        raise ValueError("Unknown BLAST program %s, must be one of: %s" % \
            (prog, ", ".join(sorted(progToDb))))
    seqType, ext = progToDb[prog]

    if isfile(inFaDir):
        inFaFiles = [inFaDir]
    elif isdir(inFaDir):
        inFaFiles = glob.glob(join(inFaDir, "*.fa"))
    else:
        raise Exception("%s is neither a fa file nor a directory" % inFaDir)

    logging.info("Found %d input files" % len(inFaFiles))

    outDir = abspath(join(outDir, prog))
    maxCommon.mustExistDir(outDir, makeDir=True)
    blastDir = join(dbDir, seqType)

    logging.info("Scanning %s for %s files" % (blastDir, ext))
    dbFnames = pubGeneric.findFiles(blastDir, ext)
    logging.info("Found %d target files" % len(dbFnames))

    if dataFilter:
        # NOTE(review): the entries are unpacked as (relDbDir, dbFname) pairs
        # below, so "in" is a membership test on the pair, not a substring
        # search in the file name -- confirm this is the intended matching.
        dbFnames = [f for f in dbFnames if dataFilter in f]
        logging.info("Filter '%s' activated: reduced to %d target files" % (dataFilter, len(dbFnames)))

    logging.info("Preparing %d BLAST jobs" % (len(dbFnames)*len(inFaFiles)))

    blastPath = join(pubConf.blastBinDir, "blastall")
    for fname in inFaFiles:
        for relDbDir, dbFname in dbFnames:
            if getsize(dbFname)==0:
                logging.warning("Zero filesize: %s" % dbFname)
                continue
            # blastall is given the database name without the index extension
            dbFname = splitext(dbFname)[0]
            blastOutDir = join(outDir, relDbDir)
            if not isdir(blastOutDir):
                logging.info("making %s" % blastOutDir)
                os.makedirs(blastOutDir)
            inBase = splitext(basename(fname))[0]
            dbBase = splitext(basename(dbFname))[0]
            outFname = join(blastOutDir, dbBase+"---"+inBase+".xml")
            # the "{check out exists ...}" wrapper is handed to the runner
            # unchanged; presumably job syntax of the cluster system -- confirm
            cmd = [blastPath, "-p", prog, "-i", "%s" % fname, "-d", \
                "%s" % dbFname, "-e", eVal, "-z", dbSize, \
                "-o", "{check out exists %s}" % outFname, "-m", "7"]
            runner.submit(cmd)

def parseOneBlastFile(inFname, outFname):
    """ parse the BLAST XML file inFname and write annotated matches to outFname

    The output is tab-separated, with AnnotatedMatchFields as header line.
    The first 10 characters of the target id are used as the article id for
    the lookup of the article meta data. Matches whose article cannot be
    resolved are skipped with a warning.
    """
    # the with-block makes sure the table is flushed and closed in all cases
    with open(outFname, "w") as ofh:
        ofh.write("\t".join(AnnotatedMatchFields)+"\n")
        for match in parseBlastXmlFile(inFname):
            articleId = match.tId[:10]
            artData = pubStore.lookupArticleData(articleId)
            if artData is None:
                logging.warning("Could not resolve artId %s" % str(articleId))
                continue
            row = [str(x) for x in match]
            row.append(artData["articleId"].encode("utf8"))
            row.append(artData["authors"].encode("utf8"))
            row.append(artData["journal"].encode("utf8"))
            row.append(str(artData["year"]).encode("utf8"))
            row.append(artData["title"].encode("utf8"))
            row.append(artData["abstract"].encode("utf8"))
            row.append(artData["fulltextUrl"].encode("utf8"))
            ofh.write("\t".join(row))
            ofh.write("\n")

def parseBlastLookupArticle(inDir, outFile):
    """ parse all blast xml files below inDir and write annotated matches to outFile

    The output is tab-separated, with AnnotatedMatchFields as header line and
    one line per match: the BlastMatch fields followed by the article meta
    data, in the same column order as the header. The first 10 characters of
    the target id are used as the article id. Matches whose article cannot be
    resolved are skipped with a warning. Empty input files are skipped and
    files with XML parsing errors are reported and abandoned.
    """
    # the with-block makes sure the table is flushed and closed in all cases
    with open(outFile, "w") as ofh:
        ofh.write("\t".join(AnnotatedMatchFields)+"\n")
        logging.info("Searching for XML files in %s" % inDir)
        inFiles = pubGeneric.findFiles(inDir, ".xml")
        logging.info("Parsing files...")
        pm = maxCommon.ProgressMeter(len(inFiles), stepCount=100)
        for relDir, fname in inFiles:
            logging.debug(fname)
            if getsize(fname)==0:
                logging.warning("zero filesize: %s" % fname)
                continue
            try:
                for match in parseBlastXmlFile(fname):
                    articleId = match.tId[:10]
                    # the lookup function lives in pubStore, there is no
                    # module-level lookupArticleData in this script
                    artData = pubStore.lookupArticleData(articleId)
                    if artData is None:
                        logging.warning("Could not resolve artId %s" % str(articleId))
                        continue
                    row = [str(x) for x in match]
                    # articleId is the first annotation column of the header
                    row.append(artData["articleId"].encode("utf8"))
                    row.append(artData["authors"].encode("utf8"))
                    row.append(artData["journal"].encode("utf8"))
                    row.append(str(artData["year"]).encode("utf8"))
                    row.append(artData["title"].encode("utf8"))
                    row.append(artData["abstract"].encode("utf8"))
                    row.append(artData["fulltextUrl"].encode("utf8"))
                    ofh.write("\t".join(row))
                    ofh.write("\n")
            except xml.parsers.expat.ExpatError:
                logging.warning("Parsing error: %s" % fname)
            pm.taskCompleted()

# Module-level dict; no code visible in this file reads or writes it.
conCache = {}

def makeHtml(inFile, outFile, textDir, minId):
    """ read a tab-sep file with annotated BlastMatch records and write it as html table

    inFile  -- tab-separated file with the AnnotatedMatchFields columns
    outFile -- html output file, written as utf8
    textDir -- not used, kept for the callers
    minId   -- if not 0/None: skip matches where the length of the match
               string without spaces and "+" characters is less than minId
               percent of the length of the aligned query string
    """
    ofh = codecs.open(outFile, "w", encoding="utf8")
    try:
        h = html.htmlWriter(fh=ofh)
        h.head("Literature BLAST", styleString=html.getStylesheet("dyndrive"))
        h.startBody("BLAST matches")
        colHeaders = ["Query Sequence", "Alignment", "Target Article"]
        colWidths = [300, 600, 700]
        h.startTable(colWidths, colHeaders)

        for match in maxCommon.iterTsvRows(inFile):
            if minId:
                matchLen = float(len(match.match.replace(" ", "").replace("+", "")))
                queryLen = len(match.qAln)
                # an empty query alignment counts as 0% identity instead of
                # stopping the whole run with a division by zero
                if queryLen == 0:
                    percentId = 0.0
                else:
                    percentId = (matchLen/queryLen)*100.0
                if percentId < minId:
                    continue

            h.startTr()
            # qFile has the format <dbBase>---<inBase>.xml, show only <inBase>
            qFile = match.qFile.split("---")[1].split(".")[0]
            lines = textwrap.wrap(match.qId, 30)
            qText = "<b>Input Sample:</b> %s<br><b>Sequence:</b> %s" % (qFile, " ".join(lines))
            alignText = "<tt>Query: &nbsp;%s<br>Match:&nbsp;  %s<br>Target: %s</tt>" % (match.qAln, match.match, match.tAln)
            articleId = match.tId[:10]
            dataset = pubStore.articleIdToDataset(articleId)
            author = match.authors.split(",")[0]+" et al., "+match.journal
            # NOTE(review): the link target is an empty string although the
            # rows have a fulltextUrl column -- confirm whether it should be used
            articleHtml = '<small>%s (%s)</small><br><a href="%s">%s</a>' % (author, dataset, "", match.title)

            h.td(qText)
            h.td(alignText)
            h.td(articleHtml)
            h.endTr()

        h.endTable()
    finally:
        # flush and close the html file, also if a row could not be processed
        ofh.close()

def submitParseJobs(inDir, outDir):
    """ write a jobList file with one parseJob per xml file below inDir and run it

    Every job calls this script with the "parseJob" command and writes a
    .tab file to the sub directory of outDir that corresponds to the sub
    directory of the xml file. The jobList file is created in the current
    directory and then run with "para make" through ssh on the host ku.
    """
    maxCommon.mustBeEmptyDir(outDir)
    logging.info("Searching for XML files in %s" % inDir)
    xmlFiles = pubGeneric.findFiles(inDir, ".xml")

    logging.info("Creating jobList file")
    jobTemplate = "%s %s %s {check in exists %s} {check out exists %s}\n"
    jobFh = open("jobList", "w")
    for relDir, xmlPath in xmlFiles:
        tabDir = join(outDir, relDir)
        if not isdir(tabDir):
            os.makedirs(tabDir)
        tabBase = splitext(basename(xmlPath))[0]
        tabPath = join(tabDir, tabBase+".tab")
        jobFh.write(jobTemplate % (sys.executable, __file__, "parseJob", xmlPath, tabPath))
    jobFh.close()

    logging.info("Running jobs")
    # the remote shell first changes into the directory with the jobList file
    sshCmd = "ssh ku 'cd %s; para resetCounts; para clearSickNodes; para make jobList'" % os.getcwd()
    os.system(sshCmd)

def iterIds(faFname):
    """ yield the sequence id of every record in the fasta file faFname

    The id is the first whitespace-separated word of a header line, without
    the ">" characters at the ends of the line. Header lines without any id
    are skipped.
    """
    # the with-block closes the file once the generator is exhausted or closed
    with open(faFname) as faFh:
        for line in faFh:
            line = line.strip()
            if not line.startswith(">"):
                continue
            words = line.strip(">").split()
            # a bare ">" line has no id, do not stop with an IndexError on it
            if words:
                yield words[0]

def iterSeqCounts(dbDir):
    """ yield, for every .fa file found below dbDir, a dict articleId -> number of sequences

    The article id is the integer value of the first 10 characters of a
    sequence id. One dict is yielded per fasta file.
    """
    logging.info("Searching for fa files in %s" % dbDir)
    faFiles = pubGeneric.findFiles(dbDir, ".fa")
    meter = maxCommon.ProgressMeter(len(faFiles))

    logging.info("Parsing fa files to get article -> seq counts")
    for _relDir, faPath in faFiles:
        seqsPerArticle = collections.defaultdict(int)
        for seqId in iterIds(faPath):
            articleKey = int(seqId[:10])
            seqsPerArticle[articleKey] += 1
        yield seqsPerArticle
        # progress is advanced when the consumer asks for the next file
        meter.taskCompleted()
            

# === MAIN ====
def main(args, options):
    """ run the command args[0] with the remaining arguments

    args    -- positional command line arguments: the command followed by
               its arguments (see the usage text of the option parser)
    options -- parsed command line options; eVal, dbSize, limit and
               percentId are read here

    Raises Exception for an unknown command.
    """
    cmd = args[0]
    dbDir = args[1]

    if cmd == "index":
        datasets = args[2:]
        indexAnnots(dbDir, datasets, isProt=True)
        indexAnnots(dbDir, datasets, isProt=False)

    elif cmd in ["dna", "prot"]:
        inFaDir = abspath(args[2])
        outDir = args[3]
        runner = pubGeneric.makeClusterRunner(__file__)
        # dna input is searched with blastn and blastx, protein input with
        # blastp and tblastn
        if cmd=="dna":
            progs = ["blastn", "blastx"]
        else:
            progs = ["blastp", "tblastn"]
        for prog in progs:
            submitBlastJobs(runner, prog, dbDir, inFaDir, outDir, options.eVal, options.dbSize, options.limit)
        runner.finish(wait=True)

    elif cmd=="parse":
        inDir = args[1]
        outFile = args[2]
        parseBlastLookupArticle(inDir, outFile)

    elif cmd=="clustParse":
        inDir = args[1]
        outDir = args[2]
        submitParseJobs(inDir, outDir)

    elif cmd=="parseJob":
        inFname = args[1]
        outFname = args[2]
        parseOneBlastFile(inFname, outFname)

    elif cmd=="clustCat":
        inDir = args[1]
        outFname = args[2]
        logging.info("Looking for files in %s" % inDir)
        inFnames = pubGeneric.findFiles(inDir, ".tab")
        # with-blocks close the output and every input file, also on errors
        with open(outFname, "w") as ofh:
            ofh.write("\t".join(AnnotatedMatchFields)+"\n")
            pm = maxCommon.ProgressMeter(len(inFnames))
            logging.info("Concatting...")
            for relDir, fn in inFnames:
                with open(fn) as ifh:
                    for line in ifh:
                        # skip the header line of the individual files
                        if not line.startswith("length"):
                            ofh.write(line)
                pm.taskCompleted()

    elif cmd=="counts":
        outFname = args[2]
        with open(outFname, "w") as ofh:
            ofh.write("\t".join(["articleId", "seqCount"])+"\n")
            for counts in iterSeqCounts(dbDir):
                # items() works with Python 2 and 3, unlike iteritems()
                for artId, count in counts.items():
                    ofh.write("%s\t%d\n" % (artId, count))

    elif cmd=="html":
        inFile = args[1]
        outFile = args[2]
        textDir = None
        if len(args)>3:
            textDir = args[3]
        makeHtml(inFile, outFile, textDir, options.percentId)

    else:
        raise Exception("Illegal command %s" % cmd)
        

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# Everything below runs at module level: the usage text and the options are
# defined, the command line is parsed and main() is called.
parser = optparse.OptionParser("""usage: 
%prog index <dbDir> <importNames> - import data from importNames
        dataset names can be either from pubConf.pubMapBaseDir (fulltext, like elsevier, pmc)
        or pubConf.textBaseDir (DBs, like genbank, imgt)
    
%prog dna <dbDir> <inDir> <outDir>  - blastn files from inDir onto dbDir and write results to outDir
%prog prot <dbDir> <inDir> <outDir>  - blastp files from inDir onto dbDir and write results to outDir

%prog parse <outDir> <outFile>  - parse blast xml files in outDir and subdirs to outFile
ALTERNATIVE:
%prog clustParse <outDir> <parseOutDir>  - parse blast xml files on ku to .tab files
%prog clustCat <parseOutDir> <outFile>  - concat .tab files

%prog html <outFile> <outHtmlFile> - write html summary 

%prog counts <dbDir> <outTabFile>

examples:
pubBlast index /hive/data/inside/pubs/blastpDb elsevier pmc imgt aai
pubBlast prot /hive/data/inside/pubs/blastpDb /hive/users/nknguyen/immuno/AS/adaptiveTcr/irepNallAdaptTCR/productiveCdr3 out/
mkdir out.parse
pubBlast clustParse out out.parse
pubBlast clustCat out.parse out.tab
pubBlast html out.tab out.html

count command:
pubBlast counts hive/data/inside/pubs/blastpDb artIdCounts.tab

""")

# logging options, handed to pubGeneric.setupLoggingOptions below
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages") 
# eVal and dbSize are kept as strings, main() passes them on to the blast jobs
parser.add_option("-e", "--eVal", dest="eVal", action="store", help="maximum eVal (careful: set the database size to something reasonable!), default %default", default="10")
parser.add_option("-z", "--dbSize", dest="dbSize", action="store", help="effective database size, default %default", default="30000000")
# maxCount is parsed, but only commented-out code in main() refers to it
parser.add_option("-m", "--maxCount", dest="maxCount", action="store", type="int", help="for the filter step: remove articles with more than X matches, default %default", default=50)
# percentId is the minId argument of the html command
parser.add_option("-i", "--percentId", dest="percentId", action="store", type="int", help="minimum percent id for match to shown in the html step, calculated as length of Match-string without spaces/+s etc divided by length of Query string", default=0) 
# limit is the dataFilter argument of the dna and prot commands
parser.add_option("-l", "--limit", dest="limit", action="store", help="blast only against a given dataset, e.g. pmc, elsevier or imgt")
(options, args) = parser.parse_args()
# at least a command and one argument are needed, otherwise show the help
# text and exit with status 1
if len(args)<=1:
    parser.print_help()
    sys.exit(1)
pubGeneric.setupLoggingOptions(options)
main(args, options)