forked from ctb/eel-pond
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake-reciprocal-best-hits.py
98 lines (82 loc) · 2.67 KB
/
make-reciprocal-best-hits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#! /usr/bin/env python
import sys
import blastparser
import cPickle
import argparse
def collect_best_hits(filename, qfn=None):
d = {}
for n, record in enumerate(blastparser.parse_fp(open(filename))):
if n % 25000 == 0:
print '...', filename, n
best_score = None
for hit in record.hits:
for match in hit.matches:
query = record.query_name
if qfn:
query = qfn(query)
subject = hit.subject_name
score = match.score
# only keep the best set of scores for any query
if best_score and best_score > score:
continue
best_score = score
x = d.get(query, [])
x.append((subject, score))
d[query] = x
if best_score and best_score != score:
break
return d
def parse_ncbi_query(name):
name = name.split('|')[2:]
name = '|'.join(name)
return name
def main():
parser = argparse.ArgumentParser()
parser.add_argument('tr_vs_ref')
parser.add_argument('ref_vs_tr')
parser.add_argument('output')
parser.add_argument('-z', '--no-ncbi', action='store_false',
dest='ncbi', default=True)
parser.add_argument('-n', '--no-parse', action='store_false',
dest='do_parse', default=True)
args = parser.parse_args()
tr_vs_ref = args.tr_vs_ref
ref_vs_tr = args.ref_vs_tr
outputfilename = args.output
cachefile = outputfilename + '.cache'
if args.do_parse:
print 'collecting best hits from:', tr_vs_ref
d = collect_best_hits(tr_vs_ref)
print 'collecting best hits from:', ref_vs_tr
if args.ncbi:
e = collect_best_hits(ref_vs_tr, parse_ncbi_query)
else:
e = collect_best_hits(ref_vs_tr)
print 'saving best hits result to', cachefile
fp = open(cachefile, 'w')
cPickle.dump(d, fp)
cPickle.dump(e, fp)
fp.close()
else:
print 'loading cached best hits from', cachefile
fp = open(cachefile)
d = cPickle.load(fp)
e = cPickle.load(fp)
fp.close()
dd = {}
ee = {}
print 'calculating reciprocal best hits'
for k in d:
v = map(lambda x: x[0], d[k])
for k2 in v:
v2 = map(lambda x: x[0], e.get(k2, []))
if k in v2:
dd[k] = k2
ee[k2] = k
print 'saving reciprocal best hits to', outputfilename
fp = open(outputfilename, 'w')
cPickle.dump(dd, fp)
cPickle.dump(ee, fp)
fp.close()
if __name__ == '__main__':
main()