-
Notifications
You must be signed in to change notification settings - Fork 20
/
plotBigRocPrecRecall.py
274 lines (229 loc) · 9.67 KB
/
plotBigRocPrecRecall.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# for each dataset, get the top25% and the bottom25% and assign labels 1.0 and 0.0
# then get all scores for these sequences and write scores and labels to a file out/precRecData/dataset-scoreType.tab
# then use the R ROCR package, make a precision/recall plot and write the x and y values to a file
# precRec/dataset-scoreType.precRec.tab
# finally plot all these curves with matplotlib
import os, logging, glob
from annotateOffs import *
from collections import defaultdict
from os.path import *
import matplotlib
#matplotlib.use('Agg')
#matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
#selDatasets = ["doench2014-Hs", "morenoMateos2015", "chari2015Train", "xu2015TrainHl60", "chariEval", "hart2016HelaLib2_hg19"]
#selDatasets = ["xu2015TrainHl60", "xu2015TrainMEsc", "doench2014-Hs", "doench2014-Mm", "morenoMateos2015", "chari2015Train", "doench2016azd_hg19", "hart2016HelaLib2_hg19", "hart2016HelaLib1_hg19", "hart2016Hct1161lib1_hg19", "wang2015_hg19"]
#selDatasets = ["xu2015TrainHl60", "xu2015TrainMEsc", "doench2014-Hs", "doench2014-Mm", "morenoMateos2015", "chari2015Train", "doench2016azd_hg19", "hart2016-HelaLib2Avg", "hart2016-HelaLib1Avg", "hart2016-Hct1161lib1Avg"]
selDatasets = ["xu2015TrainHl60", "xu2015TrainMEsc", "doench2014-Mm", "morenoMateos2015", "chari2015Train293T", "doench2016azd_hg19", "hart2016-Hct1161lib1Avg"]
scoreDescs = {
"wang" : "Wang Score2",
"wangOrig" : "Wang Score",
"doench" : "Doench Score",
"ssc" : "Xu Score",
"chariRaw" : "Chari Score",
"chariRank" : "Chari Rank Score",
"chariRaw" : "Chari Score",
"finalGc6" : "Ren: last 6 bp GC>4",
#"finalGc2" : "Farboud-like, last 2 bp GC",
"finalGg" : "Farboud: ends with GG",
"myScore" : "Max Score",
"crisprScan" : "Moreno-Matos Score",
"fusi" : "Fusi/Doench Score",
"drsc" : "Housden Score",
"wuCrispr" : "Wong Score",
"oof" : "Bae Out-of-Frame Score"
}
# doench = regression
def getQuartiles(freqs):
""" given a seq -> (name, freq) dict return the sequences with top/bottom 25% freq """
freqNames = []
for seq, nameFreq in freqs.iteritems():
name, freq = nameFreq
freqNames.append( (freq, seq) )
freqNames.sort()
topQuartStart = int(len(freqNames)*0.75)
bottomQuartEnd = int(len(freqNames)*0.25)
topQuart = freqNames[topQuartStart:]
bottomQuart = freqNames[:bottomQuartEnd]
topSeqs = [y for x,y in topQuart]
bottomSeqs = [y for x,y in bottomQuart]
return topSeqs, bottomSeqs
def getScores(seqScores, seqs):
""" given a dict seq -> seqType -> score and a list of seqs, return a dict
scoreType -> list of score (float, same order as in seqs)
"""
nameToScores = {}
for scoreName in allScoreNames:
scores = []
for s in seqs:
scores.append(seqScores[s][scoreName])
nameToScores[scoreName] = scores
return nameToScores
def readData(datasetName):
" return the 1/0 labels and a dict with scoreName -> list of scores "
seqScores, freqs = parseEffScores(datasetName)
topSeqs, bottomSeqs = getQuartiles(freqs)
labels = [1]*len(topSeqs)
labels.extend( [0]*len(bottomSeqs) )
allSeqs = topSeqs
allSeqs.extend(bottomSeqs)
allScores = getScores(seqScores, allSeqs)
#assert(len(labels)==len(allScores.values()[0]))
return labels, allScores, allSeqs
def writeSeqLabelsScores(allSeqs, labels, allScores, outDir, datasetName):
for scoreName, scores in allScores.iteritems():
assert(len(allSeqs)==len(scores)==len(labels))
outFname = join(outDir, "%s_%s.tab" % (datasetName, scoreName))
ofh = open(outFname, "w")
ofh.write("seq\tlabel\tscore\n")
for seq, label, score in zip(allSeqs, labels, scores):
ofh.write("%s\t%d\t%f\n" % (seq, label, score))
ofh.close()
print "wrote %s" % ofh.name
def writeRTables(outDir, datasetName):
" write R dataframes to outDir/datasetName-scoreName.tab "
allFound = True
for scoreName in allScoreNames:
outFname = join(outDir, "%s_%s.tab" % (datasetName, scoreName))
if not isfile(outFname):
allFound = False
if allFound:
print "all scoring files already exist for dataset %s in outFname %s" % (datasetName, outFname)
return
labels, allScores, allSeqs = readData(datasetName)
writeSeqLabelsScores(allSeqs, labels, allScores, outDir, datasetName)
def parseCoords(inDir, datasetName, plotType):
""" parse all files in inDir/datasetName*.coords.tab and return as dict
scoreName -> (xList, yList)
"""
ret = {}
fnames = glob.glob(join(inDir, "%s_*.%sCoords.tab" % (datasetName, plotType)))
if len(fnames)==0:
raise Exception("No filenames for dataset %s" % datasetName)
for fname in fnames:
xList, yList = [], []
for line in open(fname):
x, y = line.strip().split()
if y=="NA":
continue
xList.append(float(x))
yList.append(float(y))
scoreName = basename(fname).split('.')[0].split('_')[-1]
ret[scoreName] = (xList, yList)
return ret
def runR(dataDir):
" run R on all files in dataDir and generate the x/y coordinates "
for inFname in glob.glob(join(dataDir, "*.tab")):
if "Coords.tab" in inFname:
continue
outFname = inFname.replace(".tab", ".precRecCoords.tab")
if isfile(outFname):
continue
outFname2 = inFname.replace(".tab", ".rocCoords.tab")
cmd = "Rscript precRecCurve.R %s %s %s"% (inFname, outFname, outFname2)
print cmd
assert(os.system(cmd)==0)
def plotCoords(coords, axis):
" plot the x-y coordinates onto the axis "
plots = []
lineStyles = ['-', '--', ':', "-", "--", ":", "-", "--", "-", "-"]
selColors = ["blue", "red", "orange", "magenta", "orange", "black", "orange", "black", "black", "black"]
legendNames = []
for i, scoreName in enumerate(allScoreNames):
if scoreName.startswith("final"):
continue
if scoreName=="drsc":
continue
legendNames.append(scoreDescs[scoreName])
xyLists = coords[scoreName]
xList, yList = xyLists
color = selColors[i]
lineStyle = lineStyles[i]
xList, yList = xyLists
plot, = axis.plot(xList, yList, color=color, ls=lineStyle)
plots.append(plot)
axis.set_xlim(0,1.0)
axis.set_ylim(0,1.0)
#print scoreName, xList, yList
#plot = axArr[0].scatter(xList, yList)
#scoreNames.append(scoreName)
return plots, legendNames
def makeChariEval():
" create a special dataset chariEval based on S1 of Chari et al "
#labels, allScores, allSeqs = readData('chari2015TrainK562')
seqScores, freqs = parseEffScores('chari2015Train293T')
topSeqs = open("effData/chari2015Train/chari-S1-high-CasSp.txt").read().splitlines()
bottomSeqs = open("effData/chari2015Train/chari-S1-low-CasSp.txt").read().splitlines()
labels = [1]*len(topSeqs)
labels.extend([0]*len(bottomSeqs))
evalSeqs = list(tuple(topSeqs)) # copy of list
evalSeqs.extend(bottomSeqs)
#wuCrisprFilter = False
wuCrisprFilter = True
if wuCrisprFilter:
# keep only sequences where the wuCrispr score is not 0
filtSeqs = []
for seq in evalSeqs:
scores = seqScores[seq]
if scores["wuCrispr"]==0.0:
print "skip", seq
continue
filtSeqs.append(seq)
# make labels for them
filtLabels = []
for seq in filtSeqs:
if seq in topSeqs:
#print "Yes", seq
filtLabels.append(1)
elif seq in bottomSeqs:
#print "No", seq
filtLabels.append(0)
else:
assert(False)
print "Chari eval: after filtering, having %d seqs, %d labels" % (len(filtSeqs), len(filtLabels))
labels = filtLabels
evalSeqs = filtSeqs
# and get all scores for them
evalScores = defaultdict(list)
for seq in evalSeqs:
scores = seqScores[seq]
for scoreName, score in scores.iteritems():
evalScores[scoreName].append(score)
print "Chari eval: got %d seqs, %d labels, %d scores" % (len(evalSeqs), len(labels), len(evalScores.values()[0]))
print labels
return labels, evalScores, evalSeqs
def main():
datasetNames = selDatasets
dataDir = "out/precRecData"
if not isdir(dataDir):
os.mkdir(dataDir)
for datasetName in datasetNames:
if datasetName=="chariEval":
labels, allScores, allSeqs = makeChariEval()
writeSeqLabelsScores(allSeqs, labels, allScores, dataDir, datasetName)
else:
writeRTables(dataDir, datasetName)
runR(dataDir)
for plotType in ["precRec", "roc"]:
fig, axes = plt.subplots(2, 4)
fig.set_size_inches(20, 10)
for datasetName, axis in zip(datasetNames, fig.axes):
coords = parseCoords(dataDir, datasetName, plotType)
plots, scoreNames = plotCoords(coords, axis)
axis.set_title(datasetDescs[datasetName])
if plotType=="precRec":
axis.set_xlabel("recall")
axis.set_ylabel("precision")
else:
axis.set_xlabel("False positive rate")
axis.set_ylabel("True positive rate")
axis.legend(plots, scoreNames, loc="lower right", fontsize=8, ncol=2)
plt.tight_layout()
outfname = "out/effScores-%s.pdf" % plotType
plt.savefig(outfname)
print "wrote %s" % outfname
outfname = outfname.replace("pdf", "png")
plt.savefig(outfname)
print "wrote %s" % outfname
plt.close()
main()