Skip to content

Commit

Permalink
Don't use Seq._data in Codonseq (biopython#3179)
Browse files Browse the repository at this point in the history
* one

* two

* three

* four

* five

* six

* seven

* eight

* nine

* ten

* eleven

* travis
  • Loading branch information
mdehoon authored Jul 30, 2020
1 parent 63c3dc4 commit 6b8a4e4
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 48 deletions.
49 changes: 23 additions & 26 deletions Bio/codonalign/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,6 @@ def _get_codon_rec(
from Bio.Seq import Seq

nucl_seq = nucl.seq.ungap(gap_char)
codon_seq = ""
span = span_mode[0]
mode = span_mode[1]
aa2re = _get_aa_regex(codon_table)
Expand All @@ -600,11 +599,12 @@ def _get_codon_rec(
f"Nucleotide Record {nucl.id} do not match!"
)
aa_num = 0
codon_seq = CodonSeq()
for aa in pro.seq:
if aa == "-":
codon_seq += "---"
elif complete_protein and aa_num == 0:
this_codon = nucl_seq._data[span[0] : span[0] + 3]
this_codon = nucl_seq[span[0] : span[0] + 3]
if not re.search(
_codons2re[codon_table.start_codons], this_codon.upper()
):
Expand All @@ -622,10 +622,8 @@ def _get_codon_rec(
codon_seq += this_codon
aa_num += 1
else:
this_codon = nucl_seq._data[
(span[0] + 3 * aa_num) : (span[0] + 3 * (aa_num + 1))
]
if not str(Seq(this_codon.upper()).translate(table=codon_table)) == aa:
this_codon = nucl_seq[span[0] + 3 * aa_num : span[0] + 3 * (aa_num + 1)]
if this_codon.upper().translate(table=codon_table) != aa:
max_score -= 1
warnings.warn(
"%s(%s %d) does not correspond to %s(%s)"
Expand All @@ -639,7 +637,7 @@ def _get_codon_rec(
)
codon_seq += this_codon
aa_num += 1
return SeqRecord(CodonSeq(codon_seq), id=nucl.id)
return SeqRecord(codon_seq, id=nucl.id)
elif mode == 2:
from collections import deque

Expand All @@ -666,12 +664,13 @@ def _get_codon_rec(
i = shift_pos[shift_start.index(i)][1]
if i >= match.end():
break
codon_seq = CodonSeq()
aa_num = 0
for aa in pro.seq:
if aa == "-":
codon_seq += "---"
elif complete_protein and aa_num == 0:
this_codon = nucl_seq._data[rf_table[0] : rf_table[0] + 3]
this_codon = nucl_seq[rf_table[0] : rf_table[0] + 3]
if not re.search(
_codons2re[codon_table.start_codons], this_codon.upper()
):
Expand All @@ -692,25 +691,22 @@ def _get_codon_rec(
start = rf_table[aa_num]
end = start + (3 - shift_val)
ngap = shift_val
this_codon = nucl_seq._data[start:end] + "-" * ngap
this_codon = nucl_seq[start:end] + "-" * ngap
elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
max_score -= 1
start = rf_table[aa_num - 1] + 3
end = rf_table[aa_num]
ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
this_codon = (
nucl_seq._data[start:end]
nucl_seq[start:end]
+ "-" * ngap
+ nucl_seq._data[rf_table[aa_num] : rf_table[aa_num] + 3]
+ nucl_seq[rf_table[aa_num] : rf_table[aa_num] + 3]
)
else:
start = rf_table[aa_num]
end = start + 3
this_codon = nucl_seq._data[start:end]
if (
not str(Seq(this_codon.upper()).translate(table=codon_table))
== aa
):
this_codon = nucl_seq[start:end]
if this_codon.upper().translate(table=codon_table) != aa:
max_score -= 1
warnings.warn(
f"Codon of {pro.id}({aa} {aa_num}) does not "
Expand All @@ -724,7 +720,8 @@ def _get_codon_rec(
)
codon_seq += this_codon
aa_num += 1
return SeqRecord(CodonSeq(codon_seq, rf_table=rf_table), id=nucl.id)
codon_seq.rf_table = rf_table
return SeqRecord(codon_seq, id=nucl.id)


def _align_shift_recs(recs):
Expand All @@ -751,7 +748,7 @@ def find_next_int(k, lst):
if isinstance(i, int):
rf_num[k] += 1
# isinstance(i, float) should be True
elif rec.seq._data[int(i) : int(i) + 3] == "---":
elif rec.seq[int(i) : int(i) + 3] == "---":
rf_num[k] += 1
if len(set(rf_num)) != 1:
raise RuntimeError("Number of alignable codons unequal in given records")
Expand All @@ -766,43 +763,43 @@ def find_next_int(k, lst):
break
for j, k in enumerate(col_rf_lst):
add_lst.append((j, int(k)))
if isinstance(k, float) and recs[j].seq._data[int(k) : int(k) + 3] != "---":
if isinstance(k, float) and recs[j].seq[int(k) : int(k) + 3] != "---":
m, p = find_next_int(k, full_rf_table_lst[j])
if (m - k) % 3 != 0:
gap_num = 3 - (m - k) % 3
else:
gap_num = 0
if gap_num != 0:
gaps = "-" * int(gap_num)
seq = (
recs[j].seq._data[: int(k)] + gaps + recs[j].seq._data[int(k) :]
)
seq = CodonSeq(rf_table=recs[j].seq.rf_table)
seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
full_rf_table = full_rf_table_lst[j]
bp = full_rf_table.index(k)
full_rf_table = full_rf_table[:bp] + [
v + int(gap_num) for v in full_rf_table[bp + 1 :]
]
full_rf_table_lst[j] = full_rf_table
recs[j].seq = CodonSeq(seq, rf_table=recs[j].seq.rf_table)
recs[j].seq = seq
add_lst.pop()
gap_num += m - k
i += p - 1
if len(add_lst) != rec_num:
for j, k in add_lst:
seq = CodonSeq(rf_table=recs[j].seq.rf_table)
gaps = "-" * int(gap_num)
seq = recs[j].seq._data[: int(k)] + gaps + recs[j].seq._data[int(k) :]
seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
full_rf_table = full_rf_table_lst[j]
bp = full_rf_table.index(k)
inter_rf = []
for t in filter(lambda x: x % 3 == 0, range(len(gaps))):
for t in range(0, len(gaps), 3):
inter_rf.append(k + t + 3.0)
full_rf_table = (
full_rf_table[:bp]
+ inter_rf
+ [v + int(gap_num) for v in full_rf_table[bp:]]
)
full_rf_table_lst[j] = full_rf_table
recs[j].seq = CodonSeq(seq, rf_table=recs[j].seq.rf_table)
recs[j].seq = seq
i += 1
return recs

Expand Down
41 changes: 19 additions & 22 deletions Bio/codonalign/codonseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ def __init__(self, data="", gap_char="-", rf_table=None):

# check the length of the alignment to be a triple
if rf_table is None:
seq_ungapped = self._data.replace(gap_char, "")
if len(self) % 3 != 0:
length = len(self)
if length % 3 != 0:
raise ValueError(
"Sequence length is not a multiple of "
"three (i.e. a whole number of codons)"
)
self.rf_table = list(filter(lambda x: x % 3 == 0, range(len(seq_ungapped))))
self.rf_table = list(range(0, length - self.count(gap_char), 3))
else:
# if gap_char in self._data:
# if gap_char in self:
# assert len(self) % 3 == 0, \
# "Gapped sequence length is not a triple number"
if not isinstance(rf_table, (tuple, list)):
Expand All @@ -91,9 +91,6 @@ def __init__(self, data="", gap_char="-", rf_table=None):
)
self.rf_table = rf_table

def __getitem__(self, index):
return Seq(self._data[index])

def get_codon(self, index):
"""Get the index codon from the sequence."""
if len({i % 3 for i in self.rf_table}) != 1:
Expand All @@ -103,9 +100,9 @@ def get_codon(self, index):
)
if isinstance(index, int):
if index != -1:
return self._data[index * 3 : (index + 1) * 3]
return str(self[index * 3 : (index + 1) * 3])
else:
return self._data[index * 3 :]
return str(self[index * 3 :])
else:
# This slice ensures that codon will always be the unit
# in slicing (it won't change to other codon if you are
Expand All @@ -119,8 +116,8 @@ def cslice(p):
aa_slice = aa_index[p]
codon_slice = ""
for i in aa_slice:
codon_slice += self._data[i * 3 : i * 3 + 3]
return codon_slice
codon_slice += self[i * 3 : i * 3 + 3]
return str(codon_slice)

codon_slice = cslice(index)
return CodonSeq(codon_slice)
Expand All @@ -144,9 +141,9 @@ def translate(
codon_table = CodonTable.generic_by_id[1]
amino_acids = []
if ungap_seq:
tr_seq = self._data.replace(self.gap_char, "")
tr_seq = str(self).replace(self.gap_char, "")
else:
tr_seq = self._data
tr_seq = str(self)
if rf_table is None:
rf_table = self.rf_table
p = -1 # initiation
Expand Down Expand Up @@ -183,7 +180,7 @@ def translate(

def toSeq(self):
"""Convert DNA to seq object."""
return Seq(self._data)
return Seq(str(self))

def get_full_rf_table(self):
"""Return full rf_table of the CodonSeq records.
Expand All @@ -192,21 +189,21 @@ def get_full_rf_table(self):
it translate gaps in CodonSeq. It is helpful to construct
alignment containing frameshift.
"""
ungap_seq = self._data.replace("-", "")
ungap_seq = str(self).replace("-", "")
relative_pos = [self.rf_table[0]]
for i in range(1, len(self.rf_table[1:]) + 1):
relative_pos.append(self.rf_table[i] - self.rf_table[i - 1])
full_rf_table = []
codon_num = 0
for i in filter(lambda x: x % 3 == 0, range(len(self._data))):
if self._data[i : i + 3] == self.gap_char * 3:
for i in range(0, len(self), 3):
if self[i : i + 3] == self.gap_char * 3:
full_rf_table.append(i + 0.0)
elif relative_pos[codon_num] == 0:
full_rf_table.append(i)
codon_num += 1
elif relative_pos[codon_num] in (-1, -2):
# check the gap status of previous codon
gap_stat = len(self._data[i - 3 : i].replace("-", ""))
gap_stat = 3 - self.count("-", i - 3, i)
if gap_stat == 3:
full_rf_table.append(i + relative_pos[codon_num])
elif gap_stat == 2:
Expand All @@ -217,7 +214,7 @@ def get_full_rf_table(self):
elif relative_pos[codon_num] > 0:
full_rf_table.append(i + 0.0)
try:
this_len = len(self._data[i : i + 3].replace("-", ""))
this_len = 3 - self.count("-", i, i + 3)
relative_pos[codon_num] -= this_len
except Exception: # TODO: IndexError?
# we probably reached the last codon
Expand All @@ -240,15 +237,15 @@ def ungap(self, gap="-"):
"""Return a copy of the sequence without the gap character(s)."""
if len(gap) != 1 or not isinstance(gap, str):
raise ValueError("Unexpected gap character, %s" % repr(gap))
return CodonSeq(str(self._data).replace(gap, ""), rf_table=self.rf_table)
return CodonSeq(str(self).replace(gap, ""), rf_table=self.rf_table)

@classmethod
def from_seq(cls, seq, rf_table=None):
"""Get codon sequence from sequence data."""
if rf_table is None:
return cls(seq._data)
return cls(str(seq))
else:
return cls(seq._data, rf_table=rf_table)
return cls(str(seq), rf_table=rf_table)


def _get_codon_list(codonseq):
Expand Down

0 comments on commit 6b8a4e4

Please sign in to comment.