-
Notifications
You must be signed in to change notification settings - Fork 21
/
pubBedOverlapSearch
executable file
·207 lines (168 loc) · 8.17 KB
/
pubBedOverlapSearch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
# first load the standard libraries from python
# we require at least python 2.5
#from sys import *
from __future__ import print_function
import sys
if sys.version_info[0]==2 and not sys.version_info[1]>=7:
print("Sorry, this program requires at least python 2.7")
print("You can download a more current python version from python.org and compile it")
print("into your homedir with 'configure --prefix ~/python'; make;")
print("then run this program by specifying your own python executable like this: ")
print(" ~/python/bin/python ~/pubtools/pubtools")
print("or add python/bin to your PATH before /usr/bin, then run pubtools itself")
exit(1)
# load default python packages
import logging, optparse, os, tempfile
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubGeneric, pubStore, pubConf, maxCommon, pubPubmed, util, tabfile, pubAlg, maxTables
from pubXml import *
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <blatBaseDir> <db> <keywordsCommaSep> <bedfile1> <bedfile2> ... outFile - search pubBlat matches that overlap a given bed (with unique names) file for list of keywords (comma-sep), return numbers of bed features that are overlapped by features with this keyword for each input file
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("-b", "--pubBed", dest="pubBed", action="store", help="use this bed file instead of blatBaseDir/tables/<db>.bed")
parser.add_option("-r", "--removeBed", dest="removeBed", action="store", help="do an overlapSelect -noOverlap against this file at each step")
parser.add_option("-o", "--outFname", dest="outFname", action="store", help="outfile file, default is stdout", default="stdout")
parser.add_option("-a", "--onlyAbstracts", dest="onlyAbstracts", action="store_true", help="search only abstracts for keywords", default=False)
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
def overlapSelect_MergeId(selectBed, inBed, removeBed=None):
"""
overlap two bed files (using blocks! not ranges),
return dictionary inBedName-name -> set of selectBed names
"""
logging.info("Running overlapSelect (input bed needs to have unique names for all features)")
tempOut = tempfile.NamedTemporaryFile(suffix=".bed", prefix="overlapSelectOutputId")
tempOutName = tempOut.name
if removeBed==None:
removeBedCmd = ""
else:
removeBedCmd = "overlapSelect -nonOverlapping -inFmt=bed %s stdin stdout | " % removeBed
cmd = "cut -f1-12 %(selectBed)s | %(removeBedCmd)s overlapSelect stdin -idOutput %(inBed)s -selectFmt=bed %(tempOutName)s" % locals()
util.execCmdLine(cmd)
idDict = tabfile.slurpdictset(tempOutName)
return idDict
def dispIdToArticleId(overlapIdMap, dispToArt, targetArticleIds):
""" convert all elements in values of overlapIdMap (dispIds) to
articleIds using the authors.tab file and keep those also in targetArticleIds"""
allArticleIds = []
filteredArticleIds = []
demoCount = 0
convDict = {}
for id1, dispIdSet in overlapIdMap.iteritems():
artIdList = []
for dispId in dispIdSet:
if not dispId in dispToArt:
logging.warn("%s not found in authors.tab" % dispId)
continue
artId = dispToArt[dispId]
artIdList.append(artId)
filteredSet = set(artIdList).intersection(targetArticleIds)
if len(filteredSet)>0:
convDict[id1] = filteredSet
if demoCount<10:
logging.debug(id1)
demoCount+=1
allArticleIds.extend(artIdList)
filteredArticleIds.extend(filteredSet)
logging.info("Overlapped articleIds %d" % len(set(allArticleIds)))
logging.info("Keyword-containing articleIds %d" % len(set(filteredArticleIds)))
return convDict
def getAllValues(dict):
" given a dict key -> list, get all list members "
vals = []
for key, valList in dict.iteritems():
vals.extend(valList)
return set(vals)
#def overlapArticleIds(inBed, pubBed, blatBaseDir, db, dispToArt, targetArticleIds, removeBed):
#overlapIdMap = overlapSelectOutputId(pubBed, inBed, removeBed)
#bedArticleIds = dispIdToArticleId(overlapIdMap, dispToArt, targetArticleIds)
#articleIds = getAllValues(bedArticleIds)
#return bedArticleIds
def searchForKeywords(textDir, keywords, onlyAbstracts):
" search textDir for keywords, search either abstracts of fulltext "
keywordDict = {}
keywordFname = ""
for keyword in keywords:
keyword = keyword.replace("_", " ")
keywordDict[keyword]=keyword
logging.debug("Got search keyword %s" % keyword)
keywordFname += keyword.replace(" ","_")
if onlyAbstracts:
keywordFname += ".abstracts"
keywordFname += ".articleIds"
paramDict = {"keywordDict" : keywordDict, "onlyAbstract": onlyAbstracts}
if not isfile(keywordFname):
logging.info("File %s not found, regenerating it" % keywordFname)
pubAlg.mapReduce("keywordSearch.py:FilterKeywords", textDir, paramDict, keywordFname, deleteDir=False)
else:
logging.info("Found file %s with articleIds for these keywords" % keywordFname)
logging.info("Delete this file to regenerate it")
targetArticleIds = set(maxTables.TableParser(open(keywordFname)).column("articleId"))
logging.info("Got %d articleIds with keywords" % len(targetArticleIds))
return targetArticleIds
def filterPubBed(inBed, targetArticleIds, dispToArticleId):
""" keep only features from bed that come from targetArticleIds, use dispToArtId to resolve IDs
returns temporary filename, has to be deleted afterwards
"""
logging.debug("Reading and filtering %s" % inBed)
passedLines = []
for line in open(inBed):
fields = line.split("\t")
bedName = fields[3]
articleId = dispToArticleId[bedName]
if articleId in targetArticleIds:
passedLines.append(line)
tmpFname = tempfile.mktemp(prefix="filterPubBed", suffix=".bed")
logging.debug("Writing %s" % tmpFname)
fh = open(tmpFname, "w")
for line in passedLines:
fh.write(line)
fh.close
return tmpFname
def showBedFeatures(bedNameDict, filteredPubBed):
logging.debug("Debug mode: showing input BEDs that are overlapped")
#print bedNameDict
for b in maxTables.openBed(filteredPubBed, fileType="bed4"):
#print b
if b.name in bedNameDict:
print("%s:%d-%d %s %s" % (b.chrom, b.start, b.end, b.name, bedNameDict[b.name]))
#print "\t".join(b)
# ----------- MAIN --------------
if args==[]:
parser.print_help()
exit(1)
pubGeneric.setupLogging(progFile, options)
blatBaseDir, db, keywordString = args[:3]
inBedFiles = args[3:]
pubBed = options.pubBed
removeBed = options.removeBed
outFname = options.outFname
if not isdir(blatBaseDir):
blatBaseDir = join(pubConf.pubBlatBaseDir, blatBaseDir)
textDir = open(join(blatBaseDir, "textDir.conf")).read().strip("\n")
keywords = keywordString.split(",")
targetArticleIds = searchForKeywords(textDir, keywords, options.onlyAbstracts)
# read mapping dispId -> articleId
dispIdFname = join(blatBaseDir, "authors.tab")
dispToArt = tabfile.slurpdict(dispIdFname, keyField=1, valField=0)
if pubBed==None:
pubBed = join(blatBaseDir, "tables", db+".bed")
outFile = maxTables.openFile(outFname, "w")
filteredPubBed = filterPubBed(pubBed, targetArticleIds, dispToArt)
for inBed in inBedFiles:
bedNameDict = overlapSelect_MergeId(filteredPubBed, inBed, removeBed=removeBed)
if options.debug:
showBedFeatures(bedNameDict, inBed)
#bedToArticleIds = overlapArticleIds(inBed, pubBed, blatBaseDir, db, dispToArt, targetArticleIds, removeBed)
outFile.write("overlapCount %s %d\n" %(inBed, len(bedNameDict)))
if not options.debug:
os.remove(filteredPubBed)