-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconservation.py
291 lines (274 loc) · 12.5 KB
/
conservation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
###############################
# Name: conservation.py #
# Author: Santiago Gil Begue. #
###############################
from sys import argv
from os.path import exists, isdir
from math import log
from Bio import SeqIO
import operator
# Available formats.
formats = {'abi': 'Reads the ABI "Sanger" capillary sequence traces files, including the PHRED' +
'quality scores for the base calls. This allows ABI to FASTQ conversion. Note ' +
'each ABI file contains one and only one sequence (so there is no point in indexing the file).',
'ace': 'Reads the contig sequences from an ACE assembly file. Uses Bio.Sequencing.Ace internally.',
'clustal': 'The alignment format of Clustal X and Clustal W.',
'embl': 'The EMBL flat file format. Uses Bio.GenBank internally.',
'fasta': 'This refers to the input FASTA file format introduced for Bill Pearson\'s FASTA tool, where ' +
'each record starts with a ">" line. Resulting sequences have a generic alphabet by default.',
'fastq': 'FASTQ files are a bit like FASTA files but also include sequencing qualities. In Biopython, ' +
'"fastq" (or the alias "fastq-sanger") refers to Sanger style FASTQ files which encode PHRED ' +
'qualities using an ASCII offset of 33. See also the incompatible "fastq-solexa" and ' +
'"fastq-illumina" variants used in early Solexa/Illumina pipelines, Illumina pipeline 1.8 ' +
'produces Sanger FASTQ.',
'fastq-solexa': 'In Biopython, "fastq-solexa" refers to the original Solexa/Illumina style FASTQ files ' +
'which encode Solexa qualities using an ASCII offset of 64. See also what we call the ' +
'"fastq-illumina" format.',
'fastq-illumina': 'In Biopython, "fastq-illumina" refers to early Solexa/Illumina style FASTQ files ' +
'(from pipeline version 1.3 to 1.7) which encode PHRED qualities using an ASCII offset ' +
'of 64. For good quality reads, PHRED and Solexa scores are approximately equal, so + '
'the "fastq-solexa" and "fastq-illumina" variants are almost equivalent.',
'genbank': 'The GenBank or GenPept flat file format. Uses Bio.GenBank internally for parsing.',
'ig': 'This refers to the IntelliGenetics file format, apparently the same as the MASE alignment format.',
'imgt': 'This refers to the IMGT variant of the EMBL plain text file format.',
'nexus': 'The NEXUS multiple alignment format, also known as PAUP format. Uses Bio.Nexus internally.',
'pdb-seqres': 'Reads a Protein Data Bank (PDB) file to determine the complete protein sequence as it ' +
'appears in the header (no dependency on Bio.PDB and NumPy).',
'pdb-atom': 'Uses Bio.PDB to determine the (partial) protein sequence as it appears in the structure ' +
'based on the atom coordinate section of the file (requires NumPy).',
'phd': 'PHD files are output from PHRED, used by PHRAP and CONSED for input. Uses Bio.Sequencing.Phd internally.',
'phylip': 'PHYLIP files. Truncates names at 10 characters.',
'pir': 'A "FASTA like" format introduced by the National Biomedical Research Foundation (NBRF) for the ' +
'Protein Information Resource (PIR) database, now part of UniProt.',
'seqxml': 'Simple sequence XML file format.',
'sff': 'Standard Flowgram Format (SFF) binary files produced by Roche 454 and IonTorrent/IonProton sequencing machines.',
'stockholm': 'The Stockholm alignment format is also known as PFAM format.',
'swiss': 'Swiss-Prot aka UniProt format. Uses Bio.SwissProt internally. See also the UniProt XML format.',
'tab': 'Simple two column tab separated sequence files, where each line holds a record\'s identifier and ' +
'sequence. For example, this is used by Aligent\'s eArray software when saving microarray probes ' +
'in a minimal tab delimited text file.',
'qual': 'Qual files are a bit like FASTA files but instead of the sequence, record space separated integer ' +
'sequencing values as PHRED quality scores. A matched pair of FASTA and QUAL files are often used ' +
'as an alternative to a single FASTQ file.',
'uniprot-xml': 'UniProt XML format, successor to the plain text Swiss-Prot format.'}
# Check correct number of parameters.
error = False;
if len(argv) != 3 and len(argv) != 5:
error = True
# Check if a valid format is given.
elif argv[2] not in formats:
error = True
# Check if option -top is selected.
if len(argv) == 5:
if argv[3] == '-top':
isTop = True
top = float(argv[4])
if top < 0 or top > 1:
print('Percentage must be in range [0-1]')
exit(1)
else:
error = True
else: isTop = False
if error:
print('Usage: python ' + argv[0] + ' <sequencies_file> <format> [ -top <percentage> ]')
print('Tip: You can redirect it to a file by doing: ' + argv[0] + ' <sequencies_file> <format> [ -top <percentage> ] > output.txt\n')
print('Available formats: ')
print(formats.keys())
print('\nUse: python ' + argv[0] + ' -h <format> for a format detail.')
exit(1)
# Check if option is -h.
if argv[1] == '-h':
print(formats[argv[2]])
exit(1)
# Check that input file exists.
if not exists(argv[1]):
print('> ERROR: File \'' + argv[1] + '\' doesn\'t exist.')
print(' Exiting...')
exit(1)
# Check that input file is not a directory.
if isdir(argv[1]):
print('> ERROR: File \'' + argv[1] + '\' is a directory.')
print(' Exiting...')
exit(1)
# DNA molecule bases. IUPAC nucleotide code.
ADENINE = 'a'
CYTOSINE = 'c'
GUANINE = 'g'
THYMINE = 't'
URACIL = 'u'
GAP1 = '-'
GAP2 = '.'
DNA_BASES = { ADENINE: [ADENINE],
CYTOSINE: [CYTOSINE],
GUANINE: [GUANINE],
THYMINE: [THYMINE],
URACIL: [THYMINE],
# Gaps.
GAP1 : [],
GAP2 : [],
# Combinations.
'r': [ADENINE, GUANINE],
'y': [CYTOSINE, THYMINE],
's': [GUANINE, CYTOSINE],
'w': [ADENINE, THYMINE],
'k': [GUANINE, THYMINE],
'm': [ADENINE, CYTOSINE],
'b': [CYTOSINE, GUANINE, THYMINE],
'd': [ADENINE, GUANINE, THYMINE],
'h': [ADENINE, CYTOSINE, THYMINE],
'v': [ADENINE, CYTOSINE, GUANINE],
'n': [ADENINE, CYTOSINE, GUANINE, THYMINE] }
# Amino acids. IUPAC amino acid code.
ALANINE = 'a'
CYSTEINE = 'c'
ASPARTIC_ACID = 'd'
GLUTAMIC_ACID = 'e'
PHENYLALANINE = 'f'
GLYCINE = 'g'
HISTIDINE = 'h'
ISOLEUCINE = 'i'
LYSINE = 'k'
LEUCINE = 'l'
METHIONINE = 'm'
ASPARAGINE = 'n'
PROLINE = 'p'
GLUTAMINE = 'q'
ARGININE = 'r'
SERINE = 's'
THREONINE = 't'
VALINE = 'v'
TRYPTOPHAN = 'w'
TYROSINE = 'y'
# Two additional amino acids: stop codons.
SELENOCYSTEINE = 'u'
PYRROLYSINE = 'o'
AMINO_ACIDS = { ALANINE: [ALANINE],
CYSTEINE: [CYSTEINE],
ASPARTIC_ACID: [ASPARTIC_ACID],
GLUTAMIC_ACID: [GLUTAMIC_ACID],
PHENYLALANINE: [PHENYLALANINE],
GLYCINE: [GLYCINE],
HISTIDINE: [HISTIDINE],
ISOLEUCINE: [ISOLEUCINE],
LYSINE: [LYSINE],
LEUCINE: [LEUCINE],
METHIONINE: [METHIONINE],
ASPARAGINE: [ASPARAGINE],
PROLINE: [PROLINE],
GLUTAMINE: [GLUTAMINE],
ARGININE: [ARGININE],
SERINE: [SERINE],
THREONINE: [THREONINE],
VALINE: [VALINE],
TRYPTOPHAN: [TRYPTOPHAN],
TYROSINE: [TYROSINE],
SELENOCYSTEINE: [SELENOCYSTEINE],
PYRROLYSINE: [PYRROLYSINE],
# Gaps.
GAP1: [],
GAP2: [],
# Combinations
'x': [ALANINE, CYSTEINE, ASPARTIC_ACID, GLUTAMIC_ACID,
PHENYLALANINE, GLYCINE, HISTIDINE, ISOLEUCINE,
LYSINE, LEUCINE, METHIONINE, ASPARAGINE, PROLINE,
GLUTAMINE, ARGININE, SERINE, THREONINE, VALINE,
TRYPTOPHAN, TYROSINE, SELENOCYSTEINE, PYRROLYSINE],
'b': [ASPARTIC_ACID, ASPARAGINE],
'z': [GLUTAMIC_ACID, GLUTAMINE],
'j': [ISOLEUCINE, LEUCINE] }
# Array with the sequences to process.
try:
sequences = list(SeqIO.parse(argv[1], argv[2]))
except Exception:
print('> ERROR: Wrong file format.')
print(' File \'' + argv[1] + '\' is not in \'' + argv[2] + '\' format.')
print(' Exiting...')
exit(1)
# Check that we have sequences to process. Format file may be incorrect.
if not sequences:
print('> ERROR: No DNA sequences recognized.')
print(' File \'' + argv[1] + '\' may not be in \'' + argv[2] + '\' format.')
print(' Exiting...')
exit(1)
# Check if the sequences are DNA or PROTEINS.
dna = 0; proteins = 0
for base in sequences[0].seq.lower():
dna += base in DNA_BASES
proteins += base in AMINO_ACIDS
isDNA = dna >= proteins
# Get maximum length of the sequences.
maxLength = max(len(sequence.seq) for sequence in sequences)
### DNA
if isDNA:
# Dictionary with the occurrences of each base in each position.
# bases[ADENINE][i] means how many sequences have an ADENINE base in the position i.
bases = { ADENINE : [0]*maxLength,
CYTOSINE : [0]*maxLength,
GUANINE : [0]*maxLength,
THYMINE : [0]*maxLength }
target = DNA_BASES
### PROTEIN
else:
# Dictionary with the occurrences of each amino acid in each position.
# bases[ALANINE][i] means how many sequences have an ALANINE amino acid in the position i.
bases = { ALANINE : [0]*maxLength,
CYSTEINE : [0]*maxLength,
ASPARTIC_ACID : [0]*maxLength,
GLUTAMIC_ACID : [0]*maxLength,
PHENYLALANINE : [0]*maxLength,
GLYCINE : [0]*maxLength,
HISTIDINE : [0]*maxLength,
ISOLEUCINE : [0]*maxLength,
LYSINE : [0]*maxLength,
LEUCINE : [0]*maxLength,
METHIONINE : [0]*maxLength,
ASPARAGINE : [0]*maxLength,
PROLINE : [0]*maxLength,
GLUTAMINE : [0]*maxLength,
ARGININE : [0]*maxLength,
SERINE : [0]*maxLength,
THREONINE : [0]*maxLength,
VALINE : [0]*maxLength,
TRYPTOPHAN : [0]*maxLength,
TYROSINE : [0]*maxLength,
SELENOCYSTEINE : [0]*maxLength,
PYRROLYSINE : [0]*maxLength }
target = AMINO_ACIDS
# Fill bases dictionary.
for sequence in sequences:
for i in range(len(sequence.seq)):
# No case sensitive.
base = sequence.seq[i].lower()
# Be robust to non DNA/PROTEIN format.
if base in target:
equivalentBases = target[base]
for equivalentBase in equivalentBases:
bases[equivalentBase][i] += 1.0 / len(equivalentBases)
else:
print('> ERROR: ' + sequence.id + ' may not be a DNA molecule or PROTEIN.')
print(' Found \'' + base + '\' in position ' + str(i+1) + '.')
print(' Exiting...')
exit(1)
# Calculate conservation for each position.
conservations = {}
for i in range(maxLength):
total = sum(bases[base][i] for base in bases)
conservation = 0
for base in bases:
# When bases[base][i] == 0, we have 0*log(0), which is undefined.
# So, avoid it, and add 0 (do nothing).
if bases[base][i] != 0:
freq = bases[base][i] / total
conservation += freq * log(freq)
# -top option selected.
if isTop:
if conservation < 0:
conservations[i+1] = conservation
# Normal output.
else:
print('Position ' + str(i+1) + ': ' + str(conservation))
# Return <top>% non-zero highest conservations.
if isTop:
sorted_conservations = sorted(conservations.items(), key=operator.itemgetter(1), reverse=True)
for i in range(int(len(sorted_conservations)*top)):
print('Position ' + str(sorted_conservations[i][0]) + ': ' + str(sorted_conservations[i][1]))