From cb6ddc4d637fb980da0ec227b3c5d2e323d848e5 Mon Sep 17 00:00:00 2001 From: Kyle McChesney Date: Wed, 17 Apr 2024 14:44:08 -0600 Subject: [PATCH 1/3] fix performance issue finding last alignment with lots of alignments --- palamedes/align.py | 51 +++++++++++++++++++++++++++++++++++---------- tests/test_align.py | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 11 deletions(-) diff --git a/palamedes/align.py b/palamedes/align.py index 3bba158..a2ccd23 100644 --- a/palamedes/align.py +++ b/palamedes/align.py @@ -36,6 +36,24 @@ def generate_seq_record(sequence: str, seq_id: str, molecule_type: str = MOLECUL ) +def reverse_seq_record(seq_record: SeqRecord) -> SeqRecord: + """ + Helper function to copy a SeqRecord into a new one, with the sequence reversed. This is a best effort copy, + which is only used internally to the alignment logic to more easily access the 3' end most representation of + the best alignment. + """ + return SeqRecord( + Seq("".join(reversed(seq_record.seq))), + id=seq_record.id, + name=seq_record.name, + description=seq_record.description, + dbxrefs=seq_record.dbxrefs[:], + features=seq_record.features[:], + annotations=seq_record.annotations.copy(), + letter_annotations=seq_record.letter_annotations.copy(), + ) + + def make_variant_base(ref_base: str, alt_base: str) -> str: """Helper function to generate the correct variant base given the ref and alt alignment bases""" if ref_base == alt_base: @@ -169,19 +187,30 @@ def generate_alignment( f"got: {alt_molecule_type}, expected: {molecule_type}!" ) - alignments = aligner.align(reference_seq_record, alternate_seq_record) - best_alignment = next(alignments) - best_alignment_score = best_alignment.score + # reverse the sequences and then align the reversed, keeping the first best alignment + reversed_ref_seq_record = reverse_seq_record(reference_seq_record) + reversed_alt_seq_record = reverse_seq_record(alternate_seq_record) + reversed_alignments = aligner.align(reversed_ref_seq_record, reversed_alt_seq_record) + reversed_alignment = reversed_alignments[0] - other_alignments_with_best = [alignment for alignment in alignments if alignment.score == best_alignment_score] - if len(other_alignments_with_best) > 0: - LOGGER.debug( - "Found %s alignments with max score, returning last in the list (3' end rule)", - len(other_alignments_with_best) + 1, - ) - best_alignment = other_alignments_with_best[-1] + # undo the reversal, to recover the "last" highest scoring alignment for the forward + # which should correspond to the 3' end most alignment and follow HGSV spec + # note 'infer_coordinates' will soon be deprecated, update to 'parse_printed_alignment' once its live + forward_coordinates = Alignment.infer_coordinates( + [ + reversed_alignment[0][::-1], + reversed_alignment[1][::-1], + ] + ) + foward_alignment = Alignment( + [reference_seq_record, alternate_seq_record], + forward_coordinates, + ) + + # score is not technically an attribute on the class + setattr(foward_alignment, "score", reversed_alignment.score) - return best_alignment + return foward_alignment def generate_variant_blocks(alignment: Alignment) -> list[VariantBlock]: diff --git a/tests/test_align.py b/tests/test_align.py index a0b809c..bd1f5ed 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -197,6 +197,57 @@ def test_generate_alignment_custom_aligner(self): alignment = generate_alignment(ref, alt, aligner=custom_aligner) self.assertTrue(alignment.score, custom_match_score) + def test_generate_alignment_three_prime_end_most_del(self): + ref, alt = self.make_seq_records( + "T" + "A" * 10 + "G", + "T" + "A" * 9 + "G", + ) + + alignment = generate_alignment(ref, alt) + + # ensure we get the ref unchanged + # and the last A deleted, not any others + self.assertEqual(alignment[0], ref.seq) + self.assertEqual(alignment[1], "T" + "A" * 9 + "-G") + + def test_generate_alignment_three_prime_end_most_double_del(self): + ref, alt = self.make_seq_records( + "ATTGCCA", + "ATGCA", + ) + + alignment = generate_alignment(ref, alt) + + # ensure we get the ref unchanged + # and the second of each paired based deleted + self.assertEqual(alignment[0], ref.seq) + self.assertEqual(alignment[1], "AT-GC-A") + + def test_generate_alignment_three_prime_end_most_ins(self): + ref, alt = self.make_seq_records( + "T" + "A" * 9 + "G", + "T" + "A" * 10 + "G", + ) + + alignment = generate_alignment(ref, alt) + + # ensure we get the alt unchanged + # and the insertion happens after the last A in the ref + self.assertEqual(alignment[0], "T" + "A" * 9 + "-G") + self.assertEqual(alignment[1], alt.seq) + + def test_generate_alignment_three_prime_end_most_double_ins(self): + ref, alt = self.make_seq_records( + "ATGCA", + "ATTGCCA", + ) + alignment = generate_alignment(ref, alt) + + # ensure we get the alt unchanged + # and the insertion happens after the last A in the ref + self.assertEqual(alignment[0], "AT-GC-A") + self.assertEqual(alignment[1], alt.seq) + class GenerateVariantBlocksTestCase(PalamedesBaseCase): def test_generate_variant_blocks_all_matches(self): From 8099521798b80850fc5cf103ac8f5b20a9c81c0f Mon Sep 17 00:00:00 2001 From: Kyle McChesney Date: Wed, 17 Apr 2024 14:45:25 -0600 Subject: [PATCH 2/3] use slice syntax for reversed --- palamedes/align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/palamedes/align.py b/palamedes/align.py index a2ccd23..80c11e5 100644 --- a/palamedes/align.py +++ b/palamedes/align.py @@ -43,7 +43,7 @@ def reverse_seq_record(seq_record: SeqRecord) -> SeqRecord: the best alignment. """ return SeqRecord( - Seq("".join(reversed(seq_record.seq))), + Seq(seq_record.seq[::-1]), id=seq_record.id, name=seq_record.name, description=seq_record.description, From a305ca50ccbf3ac6d9087291ff445d0595f7dd4e Mon Sep 17 00:00:00 2001 From: Rohan Grover <95771142+rohan-mammothbio@users.noreply.github.com> Date: Wed, 17 Apr 2024 15:10:38 -0700 Subject: [PATCH 3/3] minor: spelling fix --- palamedes/align.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/palamedes/align.py b/palamedes/align.py index 80c11e5..f2e5128 100644 --- a/palamedes/align.py +++ b/palamedes/align.py @@ -202,15 +202,15 @@ def generate_alignment( reversed_alignment[1][::-1], ] ) - foward_alignment = Alignment( + forward_alignment = Alignment( [reference_seq_record, alternate_seq_record], forward_coordinates, ) # score is not technically an attribute on the class - setattr(foward_alignment, "score", reversed_alignment.score) + setattr(forward_alignment, "score", reversed_alignment.score) - return foward_alignment + return forward_alignment def generate_variant_blocks(alignment: Alignment) -> list[VariantBlock]: