-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathortho_filtering.py
173 lines (157 loc) · 6.59 KB
/
ortho_filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
GNU nano 2.0.9 File: /home/zoo/guermons/lab/Scripts/ortho_filtering.py
#!/usr/bin/env python2.7
# this script outputs a .end file of orthologs containing only the required distribution of taxa
# from the .end output file of FastOrtho
# Sarah Guermond
# last updated 11 July 2014
import io
import sys
import re
import os
from collections import defaultdict
if len(sys.argv) < 6:
print("\nProduces filtered .end file (FastOrtho) with the required number of matches")
print "from two user-specified categories"
print("\nUsage: ./ortho_filtering.py <ortho.end> <output> <list 1> <num 1> <list 2> <num 2>")
print("\n\t<ortho.end>: FastOrtho .end file")
print("\t<list 1>: list of species in category 1 (must match species names in .end)")
print("\t<num 1>: minimum number of representatives from list 1 desired in ortholog")
print("\t<list 2>: list of species in category 2 (must match species names in .end)")
print("\t<num 2>: minimum number of representatives from list 2 desired in ortholog")
print("\nEx: ./ortho_filtering.py orthos.end filtered_orthos.end cats.txt 2 dogs.txt 3")
print "will return orthologs with at least 2 cats and 3 dogs"
print "\nIn cases where only orthologs from one list are desired, enter None for the other number"
quit()
orthologs = sys.argv[1]
output = sys.argv[2]
list1_file = sys.argv[3]
min1 = sys.argv[4]
list2_file = sys.argv[5]
min2 = sys.argv[6]
# read lists into memory
def file_to_list(file):
list = []
list_handle = io.open(file, "rb")
for line in list_handle:
word = line.strip()
list.append(word)
list_handle.close()
return(list)
list1 = file_to_list(list1_file)
list2 = file_to_list(list2_file)
# creat filehandles
ortho_handle = io.open(orthologs, "rb")
outfile_handle = io.open(output, "wb")
seqnum_handle = io.open(output + "_seqnums", "wb")
# make function dealing with none
def none_list(inhandle, outhandle, numhandle, list_none, list_num, min):
match = 0
nomatch = 0
numhandle.write("Ortholog\t" + "\t".join(list_num))
for line in inhandle:
sp_dict = {} # keeps track of which species have been seen in this ortholog
list_num_count = 0
list_none_count = 0
line_list = line.strip().split("\t")
seq_line = line_list[1].strip()
seq_species = seq_line.split(" ")
for seq_sp in seq_species:
left = seq_sp.find('(')
right = seq_sp.find(')')
sp = seq_sp[left+1:right]
if sp in list_num:
# check if seen before
if sp in sp_dict:
sp_dict[sp] += 1
else:
sp_dict[sp] = 1
list_num_count += 1
elif sp in list_none:
list_none_count += 1
else:
print "No species ", sp, " found in none list"
# check against requirements
if list_num_count >= min and list_none_count == 0:
outhandle.write(line)
# get ortholog name
ortho_header = line_list[0]
ortho_header_split = ortho_header.split(" ")
ortho = ortho_header_split[0]
match += 1
# print to seq num file
numhandle.write('\n' + ortho + '\t')
for species in list_num:
if species in sp_dict:
numhandle.write((str(sp_dict[species]) + "\t").rstrip('\n'))
else:
numhandle.write((str("0") + '\t').rstrip('\n'))
else:
nomatch += 1
return(match, nomatch)
# if no None
def regular_list(inhandle, outhandle, numhandle, list1, min1, list2, min2):
match = 0
nomatch = 0
numhandle.write("Ortholog\t" + "\t".join(list1 + list2))
for line in inhandle:
sp_dict = {} # keeps track of which species have been seen in this ortholog
list1_count = 0
list2_count = 0
line_list = line.strip().split("\t")
seq_line = line_list[1].strip()
seq_species = seq_line.split(" ")
for seq_sp in seq_species:
# split on ()
left = seq_sp.find('(')
right = seq_sp.find(')')
sp = seq_sp[left+1:right]
if sp in list1:
if sp in sp_dict:
sp_dict[sp] += 1
else:
sp_dict[sp] = 1
list1_count += 1
elif sp in list2:
if sp in sp_dict:
sp_dict[sp] += 1
else:
sp_dict[sp] = 1
list2_count += 1
else:
print "No species ", sp, " found in list 1 or 2"
if list1_count >= min1 and list2_count >= min2:
# print line to outfile
outhandle.write(line)
# get ortholog name
ortho_header = line_list[0]
ortho_header_split = ortho_header.split(" ")
ortho = ortho_header_split[0]
# print to seq num file
numhandle.write('\n' + ortho + '\t')
for species in list1 + list2:
if species in sp_dict:
numhandle.write((str(sp_dict[species]) + "\t").rstrip('\n'))
else:
numhandle.write((str("0") + '\t').rstrip('\n'))
match += 1
else:
nomatch += 1
return(match, nomatch)
# establish number needed for list
if min1 == "None" and min2 != "None":
num2 = int(min2)
match, nomatch = none_list(ortho_handle, outfile_handle, seqnum_handle, list1, list2, num2)
elif min2 == "None" and min1 != "None":
num1 = int(min1)
match, nomatch = none_list(ortho_handle, outfile_handle, seqnum_handle, list2, list1, num1)
elif min1 == min2 == "None":
print "Error: you asked for none from both lists"
quit()
else:
num1 = int(min1)
num2 = int(min2)
match, nomatch = regular_list(ortho_handle, outfile_handle, seqnum_handle, list1, num1, list2, num2)
print match, " matches found"
print nomatch, " had no matches"
ortho_handle.close()
outfile_handle.close()