-
Notifications
You must be signed in to change notification settings - Fork 20
/
compRelativeScores.py
90 lines (80 loc) · 3.4 KB
/
compRelativeScores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os, logging, operator
from os.path import isfile, splitext, join
from annotateOffs import *
from collections import defaultdict
from scipy.stats import linregress, binom
import matplotlib
matplotlib.use('Agg')
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
import numpy as np
#logging.getLogger().setLevel(logging.DEBUG)
def compPairs(fname, minScoreDiff):
print "Comparing guides from %s, minimum score difference %f" % (fname, minScoreDiff)
byGene = defaultdict(list) # dict gene -> list of (guideName, modFreq, scores)
for row in iterTsvRows(fname):
if float(row.modFreq)==0.0:
continue
gene = row.guide.split("-")[0]
scores = {}
scores["doench"] = float(row.doench)
scores["ssc"] = float(row.ssc)
scores["svm"] = float(row.svm)
chariRaw, chariRank = lookupchariScore(row.extSeq[4:27])
scores["chariRaw"] = chariRaw
scores["chariRank"] = chariRank
byGene[gene].append( (row.guide, float(row.modFreq), scores) )
# keep only genes with two guides
twoGuides = dict()
for gene, guideList in byGene.iteritems():
if len(guideList)==2:
twoGuides[gene]=guideList
elif len(guideList)>2:
guideList.sort(key=operator.itemgetter(1))
twoGuides[gene]=(guideList[0], guideList[1])
else:
continue
# for each gene, test if the order of the modFreq scores is the same as the order of the scores
scoreNames = ["doench", "ssc", "svm", "chariRaw"]
okCounts = defaultdict(int)
for gene, guidePair in twoGuides.iteritems():
guide1, guide2 = guidePair
guide1Name, guide2Name = guide1[0], guide2[0]
freq1, freq2 = guide1[1], guide2[1]
if abs(freq2-freq1) < minScoreDiff:
#print abs(freq2-freq1), "is <0.1"
logging.debug("difference not high enough")
continue
okCounts["all"] += 1
scores1, scores2 = guide1[2], guide2[2]
logging.debug("guides (%s, %s), modFreq (%f, %f), doench (%f,%f), ssc (%f,%f)" % (guide1Name, guide2Name, freq1, freq2, scores1["doench"], scores2["doench"], scores1["ssc"], scores2["ssc"]))
anyOk = False
if freq2 > freq1:
for scoreName in scoreNames:
if scores2[scoreName] > scores1[scoreName]:
logging.debug( scoreName+ " OK")
okCounts[scoreName] += 1
anyOk = True
else:
for scoreName in scoreNames:
if scores2[scoreName] < scores1[scoreName]:
logging.debug( scoreName+ " OK")
okCounts[scoreName] += 1
anyOk = True
if not anyOk:
logging.debug( "No score was OK")
geneCount = okCounts["all"]
print "total number of genes:", geneCount
for scoreType, scoreCount in okCounts.iteritems():
if scoreType=="all":
continue
pVal = binom.sf(scoreCount-1,geneCount,0.5)
print "%s was correct %d times (p-Val %f)" % (scoreType, scoreCount, pVal)
def main():
# parse the input file
#compPairs("out/xu2015-compDoenchSsc.tsv", 20)
compPairs("out/varshney2015-compEffData.tsv", 20)
#compPairs("out/xu2015Train-compDoenchSsc.tsv", 0.9)
#compPairs("out/doench2014-S7-9Genes-compDoenchSsc.tsv", 1.0)
compPairs("out/doench2014-Hs-compEffData.tsv", 0.5)
main()