-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequencelib.py
4377 lines (3463 loc) · 190 KB
/
sequencelib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# By Anders Gorm Pedersen, [email protected], Technical University of Denmark, Bioinformatics, 2012-2024
##############################################################################################################
"""Classes and methods for reading, analyzing, manipulating, and writing DNA and protein sequences"""
from math import ceil
from math import floor
from math import sqrt
from math import log
from math import log10
import math
from collections import Counter
from contextlib import nullcontext
import re
import string
import sys
import time
import random
import itertools
from io import StringIO
import copy
import os
import numpy as np
import pandas as pd
import Levenshtein as lv
##############################################################################################################
# Various functions used by methods, that do not fit neatly in any class
##############################################################################################################
def find_seqtype(seqsample):
# Note: for small sequences it can occur that Protein seq contains only DNA_maxambig symbols
# Perhaps look at frequencies instead? Other approaches?
letters = set(seqsample) - set("-") # Accepts most input types (list, string, set)
if letters <= Const.Restriction:
return "restriction"
elif letters <= Const.Standard:
return "standard"
elif letters <= Const.DNA_maxambig:
return "DNA"
elif letters <= Const.Protein_maxambig:
return "protein"
elif letters <= Const.ASCII:
return "ASCII" # Maybe drop as a type?
else:
raise SeqError("Unknown sequence type. Unrecognized symbols: {}".format(list(letters - Const.ASCII)))
##############################################################################################################
def seqtype_attributes(seqtype):
"""Returns alphabet and ambigsymbols for given seqtype"""
# Python note: hack. Should rethink logic around seqtypes and refactor
# They can be set too many places at both container and element level
if seqtype == "restriction":
return (Const.Restriction, set())
elif seqtype == "standard":
return (Const.Standard, set())
elif seqtype == "DNA":
return (Const.DNA_maxambig, Const.DNA_maxambig - Const.DNA)
elif seqtype == "protein":
return (Const.Protein_maxambig, Const.Protein_maxambig - Const.Protein)
elif seqtype == "ASCII":
return (Const.ASCII, set())
else:
raise SeqError("Unknown sequence type: {}".format(seqtype))
##############################################################################################################
def _indices(mystring, substring):
"""Helper function that finds indices of substring in string. Returns as set"""
result = set()
offset = -1
while True:
try:
offset = mystring.index(substring, offset+1)
except ValueError:
return result
result.add(offset)
##############################################################################################################
def _multi_indices(mystring, substrings):
"""Helper function: finds indices of all occurences of substrings in mystring
substrings: iterable with multiple strings.
Result is returned as set of indices"""
all_indices = set()
for c in substrings:
if not isinstance(c, str):
raise SeqError(f"Symbol is not a string as required: {c}")
all_indices |= _indices(mystring, c)
return all_indices
##############################################################################################################
def remove_comments(text, leftdelim, rightdelim):
"""Takes input string and strips away commented text, delimited by 'leftdelim' and 'rightdelim'.
Also deals with nested comments."""
# NOTE: only deals with block comments at present
# Python note: maybe this is too general. Will I ever use multichar delims?
def wordsoverlap(w1, w2):
for i in range(1, len(w2)):
if w1.startswith(w2[i:]):
return True
for i in range(1, len(w1)):
if w2.startswith(w1[i:]):
return True
return False
if leftdelim == rightdelim:
raise SeqError("Left and right delimiters are identical")
elif leftdelim in rightdelim:
raise SeqError("Left delimiter is substring of right delimiters")
elif rightdelim in leftdelim:
raise ExcepSeqErrortion("Right delimiter is substring of left delimiters")
elif wordsoverlap(leftdelim, rightdelim):
raise SeqError("Right and left delimiters overlap")
# Preprocess delims for use in re etc
leftdelim = re.escape(leftdelim)
rightdelim = re.escape(rightdelim)
# Construct sorted list of tuples of the form [(0, 'start'), (5, 'stop'), (7, 'start'), ...]
delimlist = [(match.start(), match.end(), "start") for match in re.finditer(leftdelim, text)]
# If text contains no starts (=> no comments): return un-altered text
if not delimlist:
return text
else:
delimlist.extend([(match.start(), match.end(), "stop") for match in re.finditer(rightdelim, text)])
delimlist.sort()
# Traverse text; along the way copy text not inside comment-delimiter pairs.
# Use stack ("unmatched_starts") to keep track of nesting
unmatched_starts = 0
prevpos = 0
processed_text = []
for (match_start, match_end, match_type) in delimlist:
if match_type == "start":
unmatched_starts += 1
if unmatched_starts == 1: # Beginning of new comment region
processed_text.append(text[prevpos:match_start])
elif match_type == "stop":
unmatched_starts -= 1
if unmatched_starts == 0: # End of comment region
prevpos = match_end
elif unmatched_starts == -1: # Error: more right delims than left delims
raise Exception("Unmatched end-comment delimiter. Context: '{}'".format(text[prevpos-10:prevpos+10]))
# Add final block of text if relevant (i.e., if text does not stop with rightdelim), return processed text
if prevpos < len(text):
processed_text.append(text[prevpos:])
return "".join(processed_text)
#############################################################################################################
def make_sparseencoder(alphabet, padding="X"):
"""Returns function that can sparse-encode strings in specified alphabet"""
# This function uses a "closure" to create and return a function that can later be used
# for sparse-encoding strings in the specified alphabet.
# The purpose is to avoid having to build the translation dictionary every time the
# encoder function is run (or alternatively to compute it preemptively on module import)
# Check that padding symbol is not also present in alphabet
if padding in alphabet:
raise SeqError("Sparse-encoding error: padding symbol can't also be present in alphabet")
# Build translation dictionary for specified alphabet.
# This will be available to enclosed (and returned) function
alphabet = sorted(alphabet)
zerolist = [0] * len(alphabet)
transdict = {}
for i,residue in enumerate(alphabet):
vec = zerolist[:]
vec[i] = 1
transdict[residue] = vec
transdict[padding] = zerolist[:]
# Enclosed function that will be returned and that can do sparse-encoding of input string
def sparse_encoder(sequence_string):
"""Sparse-encodes input string. Output is numpy array. Padding encoded as all zeroes:
array([0,0,0,1,1,0,0,0,0,0,0,0])"""
sparse_list = []
for residue in sequence_string:
sparse_list.extend(transdict[residue])
sparse_seq = np.array(sparse_list)
return sparse_seq
return sparse_encoder # Closure: return pointer to function that knows about transdict
#############################################################################################
#############################################################################################
class SeqError(Exception):
"""Seqlib exceptions"""
def __init__(self, errormessage):
self.errormessage = errormessage
def __str__(self):
return self.errormessage
#############################################################################################
#############################################################################################
class Const(object):
"""Global constants used by seqlib"""
# Alphabets
Restriction = set("01")
Standard = set("0123456789")
DNA = set("ACGT")
DNA_minambig = set("ACGTN")
DNA_typicalambig = set("ACGTYRN")
DNA_maxambig = set("ACGTURYMKWSBDHVN")
Protein = set("ACDEFGHIKLMNPQRSTVWY")
Protein_minambig = set("ACDEFGHIKLMNPQRSTVWYX")
Protein_maxambig = set("ACDEFGHIKLMNPQRSTVWYBZX")
ASCII = set(string.ascii_uppercase + string.digits + " ,._")
Mixed = DNA_maxambig | Protein_maxambig | Standard
#############################################################################################
#############################################################################################
class Sequence(object):
"""Baseclass for sequence classes"""
def __init__(self, name, seq, annotation, comments, check_alphabet, degap):
# NOTE: additional fields can be added later (e.g., feature list, etc.)
self.name = name
self.seq = seq.upper()
self.annotation = annotation
self.comments = comments
self.check_alphabet = check_alphabet
self.degap = degap
# If requested: remove gap characters
if self.degap:
self.remgaps()
# If requested: use set arithemtic to check whether seq contains any non-alphabet characters
if check_alphabet:
seqset = set(self.seq.upper())
non_alphabet = seqset - self.alphabet - set("-.")
if len(non_alphabet)>0:
raise SeqError("Unknown symbols in sequence %s: %s" % (name, list(non_alphabet)))
#######################################################################################
def __eq__(self, other):
"""Implements equality check between sequences"""
# Note: should gaps be removed? currently ignoring (and much faster to do so)
if self.seq == other.seq:
return True
else:
return False
#######################################################################################
def __ne__(self, other):
"""Implements inequality checking between sequences"""
# Python note: __eq__ does not cover use of != operator, so this has to be explicitly implemented
if self.seq != other.seq:
return True
else:
return False
#######################################################################################
def __len__(self):
"""Implements the len() operator for sequences"""
return len(self.seq)
#######################################################################################
def __getitem__(self, index):
"""Implements indexing, slicing, and iteration for sequences"""
# Python note: should this return new Sequence object instead?
return self.seq[index]
#######################################################################################
def __setitem__(self, index, residue):
"""Allows change of single residues by using dict-like syntax: seq[5] = 'A' for instance"""
newseq = []
newseq.append(self.seq[:index]) # Sequence string up to right before new residue
newseq.append(residue.upper())
newseq.append(self.seq[index + 1:]) # Original sequence string from right after new residue, to end
self.seq = "".join(newseq)
#######################################################################################
def __str__(self):
"""String (fasta) representation of sequence"""
return self.fasta()
#######################################################################################
def copy_seqobject(self):
"""Customized deep copy of sequence object"""
seqcopy = self.__class__(self.name, self.seq, self.annotation, self.comments)
return seqcopy
#######################################################################################
def rename(self, newname):
"""Changes name of sequence"""
self.name = newname
#######################################################################################
def subseq(self, start, stop, slicesyntax=True, rename=False):
"""Returns subsequence as sequence object of proper type. Indexing can start at zero or one"""
# Rename if requested. Note: naming should be according to original indices (before alteration)
if rename:
name = self.name + "_%d_%d" % (start, stop)
else:
name = self.name
# If slicesyntax is False: indexing starts at 1, and stop is included (and hence left unchanged)
if not slicesyntax:
start -= 1
# Sanity check: are requested subsequence indices within range of seq?
if start < 0 or stop > len(self):
raise SeqError("Requested subsequence (%d to %d) exceeds sequence length (%d)" % (start, stop, len(self)))
seq = self.seq[start:stop]
comments = self.comments
annotation = self.annotation[start:stop]
subseq = self.__class__(name, seq, annotation, comments) # Create new object of same class as self
# (don't know which sequence type we're in)
return(subseq)
#######################################################################################
def subseqpos(self, poslist, namesuffix=None):
"""Returns subsequence containing residues on selected positions as
sequence object of proper type. Indexing can start at zero or one"""
subseq = self.__class__(self.name, self.seq,
self.annotation, self.comments) # Create new object of same class as self
# (don't know which sequence type we're in)
subseq.indexfilter(poslist)
if namesuffix:
subseq.name = subseq.name + namesuffix
return(subseq)
#######################################################################################
def appendseq(self, other):
"""Appends seq from other to end of self. Returns as new Sequence object.
Name from self is retained"""
newseq = self.seq + other.seq
newannot = self.annotation + other.annotation
if self.comments and other.comments:
newcomment = self.comments + " " + other.comments
else:
newcomment = self.comments + other.comments
if self.seqtype == other.seqtype:
return self.__class__(self.name, newseq, newannot, newcomment)
else:
return Mixed_sequence(self.name, newseq, newannot, newcomment)
#######################################################################################
def prependseq(self, other):
"""Prepends seq from other before start of self. Returns as new Sequence object.
Name from self is retained"""
newseq = other.seq + self.seq
newannot = other.annotation + self.annotation
newcomment = other.comments + " " + self.comments
if self.seqtype == other.seqtype:
return self.__class__(self.name, newseq, newannot, newcomment)
else:
return Mixed_sequence(self.name, newseq, newannot, newcomment)
#######################################################################################
def windows(self, wsize, stepsize=1, l_overhang=0, r_overhang=0, padding="X", rename=False):
"""Returns window iterator object"""
# Function that returns window iterator object, which can be used to iterate over windows on sequence
# Iteration returns objects of the proper sequence type (same as parent). The rename option is like for subseq()
# Main trick here is a class which keeps track of its enclosing Sequence object ("parent"),
# while at the same time keeping track of which window the iteration has gotten to
# stepsize: consecutive windows are stepsize residues apart
# l_overhang, r_overhang: Iteration includes windows that "fall off the edge" of the sequence
# l_overhang specifies the location of the leftmost window (how many window positions on the left are outside the seq)
# r_overhang does the same for rightmost window (how many window positions are on the right of seq for rightmost window)
# padding: use this character to pad the overhangs (all non-sequence positions are set to this)
class Window_iterator:
"""Window iterator object"""
def __init__(self, parent, wsize, stepsize, l_overhang, r_overhang, padding, rename):
if l_overhang > wsize or r_overhang > wsize:
raise SeqError("l_overhang and r_overhang must be smaller than wsize")
if wsize > len(parent):
raise SeqError("wsize must be smaller than length of sequence")
self.i = 0 - l_overhang
self.wsize = wsize
self.stepsize = stepsize
self.l_overhang = l_overhang
self.r_overhang = r_overhang
self.padding = padding
self.length = len(parent)
self.right_end = self.length + r_overhang
self.parent = parent
self.rename = rename
def __iter__(self):
return self
def __next__(self):
if self.i + self.wsize <= self.right_end:
start = self.i
stop = start + self.wsize
self.i += self.stepsize
if start < 0:
window = self.parent.subseq(0, stop, rename=self.rename)
window.seq = abs(start) * self.padding + window.seq # Add the required number of padding chars
elif start >= 0 and stop <= self.length:
window = self.parent.subseq(start, stop, rename=self.rename)
elif start > 0 and stop > self.length:
window = self.parent.subseq(start, self.length, rename=self.rename)
window.seq = window.seq + (stop - self.length) * self.padding # Add the required number of padding chars
else:
raise SeqError("Execution really should never get to this line")
return window
else:
raise StopIteration
# In call below "self" points to enclosing Sequence object ("parent")
return Window_iterator(self, wsize, stepsize, l_overhang, r_overhang, padding, rename)
#######################################################################################
def remgaps(self):
"""Removes gap characters from sequence"""
self.seq = self.seq.replace("-", "")
#######################################################################################
def shuffle(self):
"""Returns shuffled sequence as object of proper subclass"""
seqlist = list(self.seq)
random.shuffle(seqlist)
shufseq = "".join(seqlist)
return self.__class__(name=self.name + "_shuffled", seq=shufseq) # Works regardless of class
#######################################################################################
def indexfilter(self, keeplist):
"""Discards symbols at positions that are not in keeplist (seq changed in place)"""
s = self.seq
seqlist = [s[i] for i in keeplist]
self.seq = "".join(seqlist)
# If Sequence object contains annotation: also apply filter to annotation sequence
if self.annotation:
s = self.annotation
annotlist = [s[i] for i in keeplist]
self.annotation = "".join(annotlist)
#######################################################################################
def seqdiff(self, other, zeroindex=True):
"""Returns list of tuples summarizing difference between seqs in self and other:
(site, residue1, residue2)
Option zeroindex=False causes site numbering to start at 1 (otherwise 0)
"""
difflist = []
for i, (res1, res2) in enumerate(zip(self.seq, other.seq)):
if res1 != res2:
if zeroindex:
difflist.append((i, res1, res2))
else:
difflist.append((i+1, res1, res2))
return difflist
# Python note: perhaps some of these returned lists should be generators instead
#######################################################################################
def hamming(self, other):
"""Directly observable distance between self and other (absolute count of different positions)"""
# Python note: completely outsourced to Levenshtein library which is blazing fast!
return lv.hamming(self.seq, other.seq)
#######################################################################################
def hamming_ignoregaps(self, other):
"""Like hamming distance, but positions with gaps are ignored"""
# Note: using this to build distance matrix will mean that for different pairs of sequences
# a different set of alignment positions are included in the distance measure
diffs = self.hamming(other)
gap_indices_self = _indices(self.seq, "-")
gap_indices_other = _indices(other.seq, "-")
n_dont_count = len(gap_indices_self ^ gap_indices_other)
return diffs - n_dont_count
#######################################################################################
def pdist(self, other):
"""Directly observable distance between self and other (in differences per site)"""
return self.hamming(other)/len(self.seq)
#######################################################################################
def pdist_ignoregaps(self, other):
"""Like pdist distance, but positions with gaps are ignored"""
# Note: using this to build distance matrix will mean that for different pairs of sequences
# a different set of alignment positions are included in the distance measure
diffs = self.hamming(other)
gap_indices_self = _indices(self.seq, "-")
gap_indices_other = _indices(other.seq, "-")
n_dont_count = len(gap_indices_self ^ gap_indices_other) # Dont count diff where ONE is gap
n_gap_pos = len(gap_indices_self | gap_indices_other)
return (diffs - n_dont_count) / ( len(self.seq) - n_gap_pos)
#######################################################################################
def pdist_ignorechars(self, other, igchars):
"""Like pdist_ignoregaps, but ignoring positions with ANY of the characters in igchars"""
# Note: using this to build distance matrix will mean that for different pairs of sequences
# a different set of alignment positions are included in the distance measure
diffs = self.hamming(other)
char_indices_self = _multi_indices(self.seq, igchars)
char_indices_other = _multi_indices(other.seq, igchars)
n_dont_count = len(char_indices_self ^ char_indices_other)
n_char_pos = len(char_indices_self | char_indices_other)
return (diffs - n_dont_count) / ( len(self.seq) - n_char_pos)
#######################################################################################
def residuecounts(self):
"""Returns dictionary with counts of residues for single seq. {letter:count}"""
return Counter(self.seq)
#######################################################################################
def composition(self, ignoregaps=True, ignoreambig=False):
"""Returns dictionary with composition for single seq. {letter:(count,freq)}"""
countdict = self.residuecounts()
ignorechars = set()
if ignoregaps:
ignorechars |= set("-")
if ignoreambig:
ignorechars |= self.ambigsymbols
if ignorechars:
alphabet = set(countdict.keys()) - ignorechars
ignorecount = sum(countdict[key] for key in ignorechars)
length = len(self) - ignorecount
else:
alphabet = set(countdict.keys())
length = len(self)
compdict = {}
for residue in alphabet:
count = countdict[residue]
freq = count/length
compdict[residue] = (count, freq)
return compdict
#######################################################################################
def findgaps(self):
"""Returns list of tuples giving (start,stop) indices for any gaps in sequence"""
indices = []
ingap = False
for i in range(len(self.seq)):
if self.seq[i] == '-' and not ingap:
start = i
ingap = True
elif self.seq[i] != '-' and ingap:
stop = i - 1
indices.append((start,stop))
ingap = False
if self.seq[-1] == "-":
stop = i
indices.append((start,stop))
return indices
#######################################################################################
def fasta(self, width=60, nocomments=False):
"""Return fasta-formatted sequence as a string"""
if not nocomments and self.comments:
fasta = [">%s %s\n" % (self.name, self.comments)]
else:
fasta = [">%s\n" % (self.name)]
# width == -1 has special meaning: print whole seq
if width == -1:
fasta.append(self.seq)
# else: fold lines at width characters
else:
numlines = int(ceil(len(self)/float(width)))
for i in range(numlines):
fasta.append(self[i*width:i*width+width])
fasta.append("\n")
del fasta[-1] # Remove trailing newline
return "".join(fasta)
#######################################################################################
def how(self, width=80, nocomments=False):
"""Return how-formatted sequence as a string"""
# NOTE: HOW requires a fixed width of 80. Should perhaps remove width-option from function
if self.comments and not nocomments:
how = ["{length:>6} {name} {com}\n".format(length=len(self.seq), name=self.name, com=self.comments)]
else:
how = ["{length:>6} {name}\n".format(length=len(self.seq), name=self.name)]
# If sequence has no annotation: construct default annotation field consisting of all dots (same len as seq)
# Note: I should perhaps check that annotation has same length as seq when present?
if not self.annotation:
self.annotation = "." * len(self)
# width == -1 has special meaning: print whole seq
if width == -1:
how.append(self.seq)
how.append("\n")
how.append(self.annotation)
# Print seq and annotation, fold lines at "width" characters
else:
numlines = int(ceil(len(self)/float(width)))
for i in range(numlines):
how.append(self[i*width:i*width+width])
how.append("\n")
for i in range(numlines):
how.append(self.annotation[i*width:i*width+width])
how.append("\n")
del how[-1] # Remove trailing newline
return "".join(how)
#######################################################################################
def gapencoded(self):
"""Returns gap-encoding of sequence (gap=1, nongap=0) as a string"""
# Note: mostly useful in mrbayes analyses where one wants to model gap changes using binary model
allsymbols = "".join(Const.DNA_maxambig | Const.Protein_maxambig | Const.ASCII)
transtable = str.maketrans("-" + allsymbols, "1" + "0" * len(allsymbols))
return self.seq.translate(transtable)
#######################################################################################
def tab(self, nocomments=False):
"""Returns tab-formatted sequence as a string:
name TAB seq TAB annotation TAB comments
If no annotation is present, comments are placed after two TABs if present"""
if (nocomments):
tabstring = "%s\t%s\t%s" % (self.name, self.seq, self.annotation)
else:
tabstring = "%s\t%s\t%s\t%s" % (self.name, self.seq, self.annotation, self.comments)
return tabstring
#######################################################################################
def raw(self):
"""Returns raw-formatted sequence as a string"""
# Note: RAW format consists of one sequence per line, with no other information
return(self.seq)
#############################################################################################
#############################################################################################
class DNA_sequence(Sequence):
"""Class containing one DNA sequence, and corresponding methods"""
# Implementation note: alphabet should really be set in a more principled manner
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="DNA"
self.alphabet = Const.DNA_maxambig
self.ambigsymbols = self.alphabet - Const.DNA
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
## #######################################################################################
##
## NOTE: Consider whether this is a productive idea for alignments, if so reimplement at some point
##
##
## def seq_to_intlist(self):
## """Converts sequence to a list of integers, useful for rapid alignment"""
##
## trans_dict = {'A':0, 'C':1, 'G':2, 'T':3}
## intlist = [(trans_dict[letter]) for letter in self.seq]
## return intlist
##
#######################################################################################
def revcomp(self):
"""Returns reverse complement sequence as DNA sequence object - original is unchanged"""
comptable = str.maketrans("ATUGCYRSWKMBDHVN", "TAACGRYSWMKVHDBN")
revcompseq = self.seq.translate(comptable)
revcompseq = revcompseq[::-1] # Bit of a hack: Empty slice (returns all) with stride -1 (reverses)
name = self.name + "_revcomp"
return DNA_sequence(name, revcompseq)
#######################################################################################
def translate(self, reading_frame=1):
"""Translates DNA sequence to protein. Returns Protein_sequence object.
reading_frame: 1, 2, or 3, where 1 means start translation at first
nucleotide in sequence. Translation includes as many full-length
codons as possible."""
# All unknown triplets are translated as X (ambiguity symbol "N" handled intelligently)
gencode = { 'TTT': 'F', 'TCT': 'S', 'TAT': 'Y', 'TGT': 'C',
'TTC': 'F', 'TCC': 'S', 'TAC': 'Y', 'TGC': 'C',
'TTA': 'L', 'TCA': 'S', 'TAA': '*', 'TGA': '*',
'TTG': 'L', 'TCG': 'S', 'TAG': '*', 'TGG': 'W',
'TCN': 'S',
'CTT': 'L', 'CCT': 'P', 'CAT': 'H', 'CGT': 'R',
'CTC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R',
'CTA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R',
'CTG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R',
'CTN': 'L', 'CCN': 'P', 'CGN': 'R',
'ATT': 'I', 'ACT': 'T', 'AAT': 'N', 'AGT': 'S',
'ATC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S',
'ATA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R',
'ATG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R',
'ACN': 'T',
'GTT': 'V', 'GCT': 'A', 'GAT': 'D', 'GGT': 'G',
'GTC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G',
'GTA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G',
'GTG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G',
'GTN': 'V', 'GCN': 'A', 'GGN': 'G'
}
protseqlist = []
first_codon_start = reading_frame - 1
n_full_codons = (len(self) - first_codon_start) // 3
last_codon_start = first_codon_start + (n_full_codons - 1) * 3
for i in range(first_codon_start, last_codon_start + 1, 3):
triplet = self[i:i+3]
aa = gencode.get(triplet, "X")
protseqlist.append(aa)
protseq = "".join(protseqlist)
return Protein_sequence(self.name, protseq)
#############################################################################################
#############################################################################################
class Protein_sequence(Sequence):
"""Class containing one protein sequence, and corresponding methods"""
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="protein"
self.alphabet = Const.Protein_maxambig
self.ambigsymbols = self.alphabet - Const.Protein
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
#######################################################################################
def shuffle(self):
"""Returns shuffled sequence as protein sequence object - original is unchanged"""
shufseq = Sequence.shuffle(self)
name = self.name + "_shuffled"
return Protein_sequence(name, shufseq)
#############################################################################################
#############################################################################################
class ASCII_sequence(Sequence):
"""Class containing one sequence containing ASCII letters, and corresponding methods"""
# Python note: will this ever be used???
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="ASCII"
self.alphabet = Const.ASCII
self.ambigsymbols = set() # Empty set. Attribute used by some functions. Change?
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
#######################################################################################
def shuffle(self):
"""Returns shuffled sequence as ASCII sequence object - original is unchanged"""
shufseq = Sequence.shuffle(self)
name = self.name + "_shuffled"
return ASCII_sequence(name, shufseq)
#############################################################################################
#############################################################################################
class Restriction_sequence(Sequence):
"""Sequence containing restriction data-type (presence/absence - e.g. gaps)"""
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="restriction"
self.alphabet = Const.Restriction
self.ambigsymbols = set() # Empty set. Attribute used by some functions. Change?
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
#############################################################################################
#############################################################################################
class Standard_sequence(Sequence):
"""Sequence containing standard data-type (e.g., morphological)"""
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="standard"
self.alphabet = Const.Standard
self.ambigsymbols = set() # Empty set. Attribute used by some functions. Change?
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
#############################################################################################
#############################################################################################
class Mixed_sequence(Sequence):
"""Sequence containing mix of different sequence types (e.g., restriction and DNA)"""
# Python note: I am not keeping track of original sequence types in Sequence objects
# This information is available only in Seq_alignment objects. Rethink approach?
# It means I no longer have access to type-specific methods such as translation
# Python note: perhaps better to rethink seqtypes and to have collection of seqs in seq?
def __init__(self, name, seq, annotation="", comments="", check_alphabet=False, degap=False):
self.seqtype="mixed"
self.alphabet = Const.Mixed
self.ambigsymbols = self.alphabet - Const.Protein - Const.DNA - Const.Mixed
Sequence.__init__(self, name, seq, annotation, comments, check_alphabet, degap)
#############################################################################################
#############################################################################################
class Contig(object):
"""Class corresponding to contig, i.e. assembly of several reads (sequences).
Has methods for checking Contig overlap, assembly, etc."""
count = 0 # Used to keep track of number of instances of class, for individual naming of contigs
#######################################################################################
def __init__(self, sequence):
"""Initialises Contig object with one Sequence object"""
self.__class__.count += 1
self.name = "contig_{:04d}".format(self.__class__.count)
contigseq = sequence.seq
self.assembly = sequence.__class__(self.name, contigseq) # Sequence object containing assembled sequence of entire Contig
readname = sequence.name
self.readdict = {}
self.readdict[readname] = sequence
self.readdict[readname].startpos = 0 # I am adding "startpos" and "stoppos" attributes to the Sequence object in readdict
self.readdict[readname].stoppos = len(sequence.seq)
#######################################################################################
def findoverlap(self, other, minoverlap=0):
"""Checks if two Contig objects overlap (one contained in other, or 3' of one = 5' of other).
Minoverlap: overlap has to be at least this large to call a match"""
alen, blen = len(self.assembly.seq), len(other.assembly.seq)
minlen = min(alen, blen)
# Check if one sequence is contained in other sequence
if alen > blen:
found = self.assembly.seq.find(other.assembly.seq)
if found != -1:
astart, astop = found, found + blen
bstart, bstop = 0, blen
size = blen
return astart, astop, bstart, bstop, size
else:
found = other.assembly.seq.find(self.assembly.seq)
if found != -1:
astart, astop = 0, alen
bstart, bstop = found, found + alen
size = alen
return astart, astop, bstart, bstop, size
# Check if 3'end of self overlaps with 5' end of other
bestmatch = None
bestlen = 0
k = minlen
while k >= minoverlap:
if self.assembly.seq[-k:] == other.assembly[:k]:
bestlen = k
astart, astop = alen - k, alen
bstart, bstop = 0, k
size = k
bestmatch = astart, astop, bstart, bstop, size
break
else:
k -= 1
# Now check if there is a better match where 5' end of self overlaps with 3' end of other
k = minlen
while k >= max(minoverlap, bestlen + 1):
if (self.assembly.seq[:k] == other.assembly[-k:]):
bestlen = k
astart, astop = 0, k
bstart, bstop = blen - k, blen
size = k
bestmatch = astart, astop, bstart, bstop, size
break
else:
k -= 1
return bestmatch
#######################################################################################
def merge(self, other, overlap):
"""Merges Contig object with other Contig given overlap info from findoverlap.
Keeps track of positions of all reads from both Contigs"""
# "overlap" is result tuple from .findoverlap method
# b == 0: a is more upstream, so assembly starts with a (vice versa for a == 0)
# bstop < bhi: b has extra part downstream of overlap that should be appended to assembly
# Update read positions with respect to assembly when merging
astart, astop, bstart, bstop, overlaplen = overlap
alen = len(self.assembly)
blen = len(other.assembly)
# First part of assembly is sequence a (self)
if bstart == 0:
# Sequence b (other) has extra part downstream that needs to be appended to assembly
if bstop < blen:
downstreamseq = other.assembly.subseq(bstop, blen)
self.assembly = self.assembly.appendseq(downstreamseq)
# Add reads from b to readlist in a, and update read coordinates:
# b added to end, so all reads in a are OK, but all reads in b were shifted by astart
for readname in other.readdict:
self.readdict[readname] = other.readdict[readname]
self.readdict[readname].startpos += astart
self.readdict[readname].stoppos += astart
# First part of assembly is sequence b (other)
# Sequence b must have extra part upstream that needs to be prepended to assembly (since bstart != 0)
elif astart == 0:
upstreamseq = other.assembly.subseq(0, bstart)
self.assembly = self.assembly.prependseq(upstreamseq)
# Correct coordinates for reads in readlist of a: these have been shifted by bstart
for readname in self.readdict:
self.readdict[readname].startpos += bstart
self.readdict[readname].stoppos += bstart
# Add reads from b to readlist in a. These have correct coordinates since b added at front
self.readdict.update( other.readdict )
#######################################################################################