Don't use Seq._data in Codonseq (biopython#3179)

* one * two * three * four * five * six * seven * eight * nine * ten * eleven * travis
peterjc · Jul 30, 2020 · 6b8a4e4 · 6b8a4e4
1 parent 63c3dc4
commit 6b8a4e4
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 48 deletions.
diff --git a/Bio/codonalign/__init__.py b/Bio/codonalign/__init__.py
@@ -589,7 +589,6 @@ def _get_codon_rec(
     from Bio.Seq import Seq
 
     nucl_seq = nucl.seq.ungap(gap_char)
-    codon_seq = ""
     span = span_mode[0]
     mode = span_mode[1]
     aa2re = _get_aa_regex(codon_table)
@@ -600,11 +599,12 @@ def _get_codon_rec(
                 f"Nucleotide Record {nucl.id} do not match!"
             )
         aa_num = 0
+        codon_seq = CodonSeq()
         for aa in pro.seq:
             if aa == "-":
                 codon_seq += "---"
             elif complete_protein and aa_num == 0:
-                this_codon = nucl_seq._data[span[0] : span[0] + 3]
+                this_codon = nucl_seq[span[0] : span[0] + 3]
                 if not re.search(
                     _codons2re[codon_table.start_codons], this_codon.upper()
                 ):
@@ -622,10 +622,8 @@ def _get_codon_rec(
                 codon_seq += this_codon
                 aa_num += 1
             else:
-                this_codon = nucl_seq._data[
-                    (span[0] + 3 * aa_num) : (span[0] + 3 * (aa_num + 1))
-                ]
-                if not str(Seq(this_codon.upper()).translate(table=codon_table)) == aa:
+                this_codon = nucl_seq[span[0] + 3 * aa_num : span[0] + 3 * (aa_num + 1)]
+                if this_codon.upper().translate(table=codon_table) != aa:
                     max_score -= 1
                     warnings.warn(
                         "%s(%s %d) does not correspond to %s(%s)"
@@ -639,7 +637,7 @@ def _get_codon_rec(
                     )
                 codon_seq += this_codon
                 aa_num += 1
-        return SeqRecord(CodonSeq(codon_seq), id=nucl.id)
+        return SeqRecord(codon_seq, id=nucl.id)
     elif mode == 2:
         from collections import deque
 
@@ -666,12 +664,13 @@ def _get_codon_rec(
                 i = shift_pos[shift_start.index(i)][1]
             if i >= match.end():
                 break
+        codon_seq = CodonSeq()
         aa_num = 0
         for aa in pro.seq:
             if aa == "-":
                 codon_seq += "---"
             elif complete_protein and aa_num == 0:
-                this_codon = nucl_seq._data[rf_table[0] : rf_table[0] + 3]
+                this_codon = nucl_seq[rf_table[0] : rf_table[0] + 3]
                 if not re.search(
                     _codons2re[codon_table.start_codons], this_codon.upper()
                 ):
@@ -692,25 +691,22 @@ def _get_codon_rec(
                     start = rf_table[aa_num]
                     end = start + (3 - shift_val)
                     ngap = shift_val
-                    this_codon = nucl_seq._data[start:end] + "-" * ngap
+                    this_codon = nucl_seq[start:end] + "-" * ngap
                 elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                     max_score -= 1
                     start = rf_table[aa_num - 1] + 3
                     end = rf_table[aa_num]
                     ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                     this_codon = (
-                        nucl_seq._data[start:end]
+                        nucl_seq[start:end]
                         + "-" * ngap
-                        + nucl_seq._data[rf_table[aa_num] : rf_table[aa_num] + 3]
+                        + nucl_seq[rf_table[aa_num] : rf_table[aa_num] + 3]
                     )
                 else:
                     start = rf_table[aa_num]
                     end = start + 3
-                    this_codon = nucl_seq._data[start:end]
-                    if (
-                        not str(Seq(this_codon.upper()).translate(table=codon_table))
-                        == aa
-                    ):
+                    this_codon = nucl_seq[start:end]
+                    if this_codon.upper().translate(table=codon_table) != aa:
                         max_score -= 1
                         warnings.warn(
                             f"Codon of {pro.id}({aa} {aa_num}) does not "
@@ -724,7 +720,8 @@ def _get_codon_rec(
                     )
                 codon_seq += this_codon
                 aa_num += 1
-        return SeqRecord(CodonSeq(codon_seq, rf_table=rf_table), id=nucl.id)
+        codon_seq.rf_table = rf_table
+        return SeqRecord(codon_seq, id=nucl.id)
 
 
 def _align_shift_recs(recs):
@@ -751,7 +748,7 @@ def find_next_int(k, lst):
             if isinstance(i, int):
                 rf_num[k] += 1
             # isinstance(i, float) should be True
-            elif rec.seq._data[int(i) : int(i) + 3] == "---":
+            elif rec.seq[int(i) : int(i) + 3] == "---":
                 rf_num[k] += 1
     if len(set(rf_num)) != 1:
         raise RuntimeError("Number of alignable codons unequal in given records")
@@ -766,43 +763,43 @@ def find_next_int(k, lst):
             break
         for j, k in enumerate(col_rf_lst):
             add_lst.append((j, int(k)))
-            if isinstance(k, float) and recs[j].seq._data[int(k) : int(k) + 3] != "---":
+            if isinstance(k, float) and recs[j].seq[int(k) : int(k) + 3] != "---":
                 m, p = find_next_int(k, full_rf_table_lst[j])
                 if (m - k) % 3 != 0:
                     gap_num = 3 - (m - k) % 3
                 else:
                     gap_num = 0
                 if gap_num != 0:
                     gaps = "-" * int(gap_num)
-                    seq = (
-                        recs[j].seq._data[: int(k)] + gaps + recs[j].seq._data[int(k) :]
-                    )
+                    seq = CodonSeq(rf_table=recs[j].seq.rf_table)
+                    seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                     full_rf_table = full_rf_table_lst[j]
                     bp = full_rf_table.index(k)
                     full_rf_table = full_rf_table[:bp] + [
                         v + int(gap_num) for v in full_rf_table[bp + 1 :]
                     ]
                     full_rf_table_lst[j] = full_rf_table
-                    recs[j].seq = CodonSeq(seq, rf_table=recs[j].seq.rf_table)
+                    recs[j].seq = seq
                 add_lst.pop()
                 gap_num += m - k
                 i += p - 1
         if len(add_lst) != rec_num:
             for j, k in add_lst:
+                seq = CodonSeq(rf_table=recs[j].seq.rf_table)
                 gaps = "-" * int(gap_num)
-                seq = recs[j].seq._data[: int(k)] + gaps + recs[j].seq._data[int(k) :]
+                seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                 full_rf_table = full_rf_table_lst[j]
                 bp = full_rf_table.index(k)
                 inter_rf = []
-                for t in filter(lambda x: x % 3 == 0, range(len(gaps))):
+                for t in range(0, len(gaps), 3):
                     inter_rf.append(k + t + 3.0)
                 full_rf_table = (
                     full_rf_table[:bp]
                     + inter_rf
                     + [v + int(gap_num) for v in full_rf_table[bp:]]
                 )
                 full_rf_table_lst[j] = full_rf_table
-                recs[j].seq = CodonSeq(seq, rf_table=recs[j].seq.rf_table)
+                recs[j].seq = seq
         i += 1
     return recs
 

diff --git a/Bio/codonalign/codonseq.py b/Bio/codonalign/codonseq.py
@@ -70,15 +70,15 @@ def __init__(self, data="", gap_char="-", rf_table=None):
 
         # check the length of the alignment to be a triple
         if rf_table is None:
-            seq_ungapped = self._data.replace(gap_char, "")
-            if len(self) % 3 != 0:
+            length = len(self)
+            if length % 3 != 0:
                 raise ValueError(
                     "Sequence length is not a multiple of "
                     "three (i.e. a whole number of codons)"
                 )
-            self.rf_table = list(filter(lambda x: x % 3 == 0, range(len(seq_ungapped))))
+            self.rf_table = list(range(0, length - self.count(gap_char), 3))
         else:
-            # if gap_char in self._data:
+            # if gap_char in self:
             #    assert  len(self) % 3 == 0, \
             #            "Gapped sequence length is not a triple number"
             if not isinstance(rf_table, (tuple, list)):
@@ -91,9 +91,6 @@ def __init__(self, data="", gap_char="-", rf_table=None):
                 )
             self.rf_table = rf_table
 
-    def __getitem__(self, index):
-        return Seq(self._data[index])
-
     def get_codon(self, index):
         """Get the index codon from the sequence."""
         if len({i % 3 for i in self.rf_table}) != 1:
@@ -103,9 +100,9 @@ def get_codon(self, index):
             )
         if isinstance(index, int):
             if index != -1:
-                return self._data[index * 3 : (index + 1) * 3]
+                return str(self[index * 3 : (index + 1) * 3])
             else:
-                return self._data[index * 3 :]
+                return str(self[index * 3 :])
         else:
             # This slice ensures that codon will always be the unit
             # in slicing (it won't change to other codon if you are
@@ -119,8 +116,8 @@ def cslice(p):
                 aa_slice = aa_index[p]
                 codon_slice = ""
                 for i in aa_slice:
-                    codon_slice += self._data[i * 3 : i * 3 + 3]
-                return codon_slice
+                    codon_slice += self[i * 3 : i * 3 + 3]
+                return str(codon_slice)
 
             codon_slice = cslice(index)
             return CodonSeq(codon_slice)
@@ -144,9 +141,9 @@ def translate(
             codon_table = CodonTable.generic_by_id[1]
         amino_acids = []
         if ungap_seq:
-            tr_seq = self._data.replace(self.gap_char, "")
+            tr_seq = str(self).replace(self.gap_char, "")
         else:
-            tr_seq = self._data
+            tr_seq = str(self)
         if rf_table is None:
             rf_table = self.rf_table
         p = -1  # initiation
@@ -183,7 +180,7 @@ def translate(
 
     def toSeq(self):
         """Convert DNA to seq object."""
-        return Seq(self._data)
+        return Seq(str(self))
 
     def get_full_rf_table(self):
         """Return full rf_table of the CodonSeq records.
@@ -192,21 +189,21 @@ def get_full_rf_table(self):
         it translate gaps in CodonSeq. It is helpful to construct
         alignment containing frameshift.
         """
-        ungap_seq = self._data.replace("-", "")
+        ungap_seq = str(self).replace("-", "")
         relative_pos = [self.rf_table[0]]
         for i in range(1, len(self.rf_table[1:]) + 1):
             relative_pos.append(self.rf_table[i] - self.rf_table[i - 1])
         full_rf_table = []
         codon_num = 0
-        for i in filter(lambda x: x % 3 == 0, range(len(self._data))):
-            if self._data[i : i + 3] == self.gap_char * 3:
+        for i in range(0, len(self), 3):
+            if self[i : i + 3] == self.gap_char * 3:
                 full_rf_table.append(i + 0.0)
             elif relative_pos[codon_num] == 0:
                 full_rf_table.append(i)
                 codon_num += 1
             elif relative_pos[codon_num] in (-1, -2):
                 # check the gap status of previous codon
-                gap_stat = len(self._data[i - 3 : i].replace("-", ""))
+                gap_stat = 3 - self.count("-", i - 3, i)
                 if gap_stat == 3:
                     full_rf_table.append(i + relative_pos[codon_num])
                 elif gap_stat == 2:
@@ -217,7 +214,7 @@ def get_full_rf_table(self):
             elif relative_pos[codon_num] > 0:
                 full_rf_table.append(i + 0.0)
             try:
-                this_len = len(self._data[i : i + 3].replace("-", ""))
+                this_len = 3 - self.count("-", i, i + 3)
                 relative_pos[codon_num] -= this_len
             except Exception:  # TODO: IndexError?
                 # we probably reached the last codon
@@ -240,15 +237,15 @@ def ungap(self, gap="-"):
         """Return a copy of the sequence without the gap character(s)."""
         if len(gap) != 1 or not isinstance(gap, str):
             raise ValueError("Unexpected gap character, %s" % repr(gap))
-        return CodonSeq(str(self._data).replace(gap, ""), rf_table=self.rf_table)
+        return CodonSeq(str(self).replace(gap, ""), rf_table=self.rf_table)
 
     @classmethod
     def from_seq(cls, seq, rf_table=None):
         """Get codon sequence from sequence data."""
         if rf_table is None:
-            return cls(seq._data)
+            return cls(str(seq))
         else:
-            return cls(seq._data, rf_table=rf_table)
+            return cls(str(seq), rf_table=rf_table)
 
 
 def _get_codon_list(codonseq):