-
Notifications
You must be signed in to change notification settings - Fork 20
/
monteCarloAlena.py
124 lines (110 loc) · 4.29 KB
/
monteCarloAlena.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# determine how well some score predicts performance in the alena dataset
# on the level of the locus, not overall
from annotateOffs import *
from collections import defaultdict
import random
import operator
def parseGuides(fname):
" parse scores.tab, return dict locusName -> (score1, score2, modFreq) "
guideCount = 0
scoreData = defaultdict(list)
for row in iterTsvRows(fname):
setName = row.guide.split("_")[0]
predScore1 = float(row.crisprScan)
#predScore1 = float(row.doench)
#predScore2 = float(row.doench)
predScore2 = float(row.fusi)
scoreData[setName].append((predScore1, predScore2, float(row.modFreq)))
guideCount += 1
return scoreData, guideCount
def getTopFreqs(scoreData):
"""
return dict with locusNames -> list of top frequencies
"""
locusTopFreqs = dict()
for locusName, locusGuides in scoreData.items():
modFreqs = [modFreq for predScore1,predScore2,modFreq in locusGuides]
modFreqs.sort()
topFreqs = modFreqs[-2:]
locusTopFreqs[locusName] = topFreqs
return locusTopFreqs
def countSuccesses(scoreData, locusTopFreqs, predFunc):
""" for each locus, identify best guides using selection function and
count how often we find indeed a top guide
topScores is a dict locus -> set of top scores
"""
successCount = 0
for locusName, locusGuides in scoreData.items():
# sort by pred score and get data of top three pred scores
# get the best two mod freqs
topFreqs= locusTopFreqs[locusName]
topGuides = predFunc(locusGuides)
# check if any of the top guides has a top mod freq
success = False
for topGuide in topGuides:
predScore1, predScore2,modFreq = topGuide
if modFreq in topFreqs:
success = True
break
if success:
successCount += 1
return successCount
#print "Locus and guideCount", locusName, len(locusGuides), "best pred:", topGuides, "highest measured mod. freqs:", topFreqs, "prediction worked?", success
#print "%d guides, "%guideCount, "number of loci", len(scoreData), "success count", successCount
def getBestCombGuides(guides):
" return a set of guides predicted by score1 and score2 "
guides.sort()
topGuides = []
# take top2 based on primary score
topGuides.extend(guides[-1:])
guides.sort(key=operator.itemgetter(1))
topGuides.extend(guides[-1:])
assert(len(topGuides)==2)
return topGuides
def getBestTwoPredGuides(guides):
" return a set of guides predicted by score1 "
guides.sort()
topGuides = []
# take top2 based on primary score
topGuides.extend(guides[-2:])
return topGuides
def getBestPredGuide(guides):
" return a set of guides predicted by score1 "
guides.sort()
topGuides = []
# take top2 based on primary score
topGuides.extend(guides[-1:])
return topGuides
def getRndGuides(guides):
" return a set of four random guides "
random.shuffle(guides)
return guides[:2]
print "Shkumatava lab zebrafish dataset"
scoreData, guideCount = parseGuides("effData/alenaAll.scores.tab")
topFreqs = getTopFreqs(scoreData)
print "Strategy: pick 2 guides with highest Moreno-Mateos score"
successCount = countSuccesses(scoreData, topFreqs, getBestTwoPredGuides)
#successCount = countSuccesses(scoreData, topFreqs, getBestCombGuides)
print "success: ", successCount, "out of", len(scoreData), "loci"
n = 100000
rndSuccCount = 0
for i in range(0, n):
rndCount = countSuccesses(scoreData, topFreqs, getRndGuides)
if rndCount >= successCount:
rndSuccCount +=1
print "pVal", rndSuccCount / float(n)
# copied for the Teboul dataset
print "Teboul in-vivo dataset"
scoreData, guideCount = parseGuides("effData/teboulVivo_mm9.scores.tab")
topFreqs = getTopFreqs(scoreData)
print "Strategy: pick 1 guide with highest Moreno-Mateos score"
successCount = countSuccesses(scoreData, topFreqs, getBestPredGuide)
#successCount = countSuccesses(scoreData, topFreqs, getBestCombGuides)
print "success: ", successCount, "out of", len(scoreData), "loci"
n = 100000
rndSuccCount = 0
for i in range(0, n):
rndCount = countSuccesses(scoreData, topFreqs, getRndGuides)
if rndCount >= successCount:
rndSuccCount +=1
print "pVal", rndSuccCount / float(n)