-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_alignment.py
executable file
·228 lines (180 loc) · 7.63 KB
/
clean_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env python3
###############################################
# Author: B.M. Anderson
# Date: 12 Mar 2019
# Modified: Oct 2020
# Modified: Jun 2021 (updated for phylip and using AlignIO, numpy and pandas)
# Modified: Mar 2023 (add sequence removal option)
# Modified: Sep 2024 (made default no filtering; updated for pandas function deprecation)
# Description: read in an alignment and remove positions/sequences with more than a specified number or percentage of gaps or Ns
###############################################
import sys
import pandas
import numpy
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
def help():
print('A script to remove positions/sequences in an alignment based on number/percentage of gaps and Ns.')
print('Choosing both a number and a percentage overrides the percentage.')
print('The cleaned alignment is output as a fasta/phylip file in the current directory.')
print('')
print('Usage: ' + str(sys.argv[0]) + ' options(-... -...) alignment')
print('')
print('Options:')
print(' -f File type (fasta [default] or phylip)')
print('')
print(' -g Maximum number of gaps and Ns allowed in the alignment at any position [default: no maximum]')
print('')
print(' -p Maximum percentage (%) of gaps and Ns allowed in the alignment at any position [default: 100]')
print('')
print(' -s Maximum percentage (%) of gaps and Ns allowed in a sequence to keep it (after gap filtering) [default: 100]')
print('')
opts = ['-f', '-g', '-p', '-s']
opt_valdic = {}
# print help if the script is called without args
if len(sys.argv[1:]) == 0:
sys.exit(help())
# assign arguments to variables and evaluate options
alignment = sys.argv[-1:][0] # retrieve the last argument slice, then entry as string
for index, arg in enumerate(sys.argv[1:-1]): # don't include last entry
if arg[0] == '-':
if arg in opts:
if index < len(sys.argv[1:-1]) - 1:
if sys.argv[1:-1][index + 1][0] == '-':
sys.exit('No argument provided for ' + str(arg))
opt_valdic[arg] = sys.argv[1:-1][index + 1]
else:
sys.exit(help())
else:
sys.exit('The argument ' + str(arg) + ' is not a valid option')
else:
continue
if '-f' in opt_valdic:
align_specified = True
align_type = opt_valdic['-f']
else:
align_specified = False
align_type = 'fasta'
if '-g' in opt_valdic:
gap_number = True
num_gaps = int(opt_valdic['-g'])
else:
gap_number = False
if '-p' in opt_valdic:
gap_percent = True
perc_gaps = int(opt_valdic['-p'])
else:
gap_percent = False
if '-s' in opt_valdic:
seq_percent = True
seq_perc_gaps = int(opt_valdic['-s'])
else:
seq_percent = False
# read in the alignment
if align_type == 'fasta':
align = AlignIO.read(open(alignment, 'r'), 'fasta')
elif align_type == 'phylip':
align = AlignIO.read(open(alignment, 'r'), 'phylip-relaxed')
else:
sys.exit('Please specify alignment type as "fasta" or "phylip"')
# capture the sequence ids and descriptions
ids = []
descriptions = []
for sequence in align:
ids.append(sequence.id)
descriptions.append(sequence.description)
len_align = align.get_alignment_length()
print('Alignment sequences: ' + str(len(align)) + ', length: ' + str(len_align) + ' bp')
# now that the data has been read, we need to convert the sequences to an array, then to a dataframe
# due to memory issues, I need to do this in chunks when the file is large
# define a function to convert to upper case
def upperit(x):
return(str(x).upper())
if len_align > 500000: # alignment is super large
align_list = []
increment = 200000
for start in range(0, len_align, increment):
end = min(len_align, start + increment)
print('Filtering from positions: ' + str(start) + ' to ' + str(end) + ' of ' + str(len_align))
n_array = numpy.array([list(sample) for sample in align[:, start: end]])
p_df = pandas.DataFrame(n_array)
rows = float(len(p_df))
# convert to upper case for search
p_df = p_df.map(upperit)
# now we need to filter the dataframe by columns, to keep columns which pass filters
# from https://stackoverflow.com/questions/31614804/how-to-delete-a-column-in-pandas-dataframe-based-on-a-condition/31618820
if gap_number:
filtp_df = p_df.loc[:, (p_df.eq('-').sum() + p_df.eq('N').sum() <= num_gaps)]
elif gap_percent:
filtp_df = p_df.loc[:, (round(100*(p_df.eq('-').sum() + p_df.eq('N').sum())/rows, 0) <= perc_gaps)]
else:
# No gap/N threshold specified, so keeping all positions
filtp_df = p_df
# now that the dataframe is filtered, we need to convert it back into an alignment
new_records = []
for index, sequence in enumerate(filtp_df.values.tolist()):
new_sequence = Seq(''.join(sequence))
new_record = SeqRecord(new_sequence, id = ids[index], description = descriptions[index])
new_records.append(new_record)
# append it to a list
align_list.append(MultipleSeqAlignment(new_records))
# stick the alignments together and output the new alignment
new_alignment = align_list[0]
for piece in align_list[1:]:
new_alignment = new_alignment + piece
# if there is a flag to check gap composition of sequences, do so
if seq_percent:
temp_record_list = []
for align_rec in new_alignment:
this_count = align_rec.seq.count('-') + align_rec.seq.count('N')
if 100 * float(this_count) / len(align_rec.seq) > seq_perc_gaps:
continue
temp_record_list.append(align_rec)
new_alignment = MultipleSeqAlignment(temp_record_list)
print('Filtered alignment sequences: ' + str(len(new_alignment)) + ', length: ' +
str(new_alignment.get_alignment_length()) + ' bp\n')
if align_type == 'fasta':
with open(alignment.replace('.f', '_clean.f'), 'w') as out_file:
AlignIO.write(new_alignment, out_file, 'fasta')
elif align_type == 'phylip':
with open(alignment.replace('.p', '_clean.p'), 'w') as out_file:
AlignIO.write(new_alignment, out_file, 'phylip-relaxed')
else: # alignment is reasonable size
n_array = numpy.array([list(sample) for sample in align])
#print('Dimensions of array: ' + str(n_array.shape))
p_df = pandas.DataFrame(n_array)
rows = float(len(p_df))
# convert to upper case for search
p_df = p_df.map(upperit)
# now we need to filter the dataframe by columns, to keep columns which pass filters
# from https://stackoverflow.com/questions/31614804/how-to-delete-a-column-in-pandas-dataframe-based-on-a-condition/31618820
if gap_number:
filtp_df = p_df.loc[:, (p_df.eq('-').sum() + p_df.eq('N').sum() <= num_gaps)]
elif gap_percent:
filtp_df = p_df.loc[:, (round(100*(p_df.eq('-').sum() + p_df.eq('N').sum())/rows, 0) <= perc_gaps)]
else:
print('No gap/N threshold specified, so keeping all positions')
filtp_df = p_df
# now that the dataframe is filtered, we need to convert it back into an alignment
new_records = []
for index, sequence in enumerate(filtp_df.values.tolist()):
new_sequence = Seq(''.join(sequence))
# if there is a flag to check gap composition of sequences, do so
if seq_percent:
this_count = new_sequence.count('-') + new_sequence.count('N')
if 100 * float(this_count) / len(new_sequence) > seq_perc_gaps:
continue
new_record = SeqRecord(new_sequence, id = ids[index], description = descriptions[index])
new_records.append(new_record)
new_alignment = MultipleSeqAlignment(new_records)
print('Filtered alignment sequences: ' + str(len(new_alignment)) + ', length: ' +
str(new_alignment.get_alignment_length()) + ' bp\n')
# output the new alignment
if align_type == 'fasta':
with open(alignment.replace('.f', '_clean.f'), 'w') as out_file:
AlignIO.write(new_alignment, out_file, 'fasta')
elif align_type == 'phylip':
with open(alignment.replace('.p', '_clean.p'), 'w') as out_file:
AlignIO.write(new_alignment, out_file, 'phylip-relaxed')