Skip to content

Commit

Permalink
Merge pull request mammothbio-os#16 from mbiokyle29/alignment-reversa…
Browse files Browse the repository at this point in the history
…l-3p-end-rule

fix performance issue finding last alignment with lots of alignments
  • Loading branch information
mbiokyle29 authored Apr 18, 2024
2 parents 62da37f + a305ca5 commit cb6e300
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 11 deletions.
51 changes: 40 additions & 11 deletions palamedes/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,24 @@ def generate_seq_record(sequence: str, seq_id: str, molecule_type: str = MOLECUL
)


def reverse_seq_record(seq_record: SeqRecord) -> SeqRecord:
"""
Helper function to copy a SeqRecord into a new one, with the sequence reversed. This is a best effort copy,
which is only used internally to the alignment logic to more easily access the 3' end most representation of
the best alignment.
"""
return SeqRecord(
Seq(seq_record.seq[::-1]),
id=seq_record.id,
name=seq_record.name,
description=seq_record.description,
dbxrefs=seq_record.dbxrefs[:],
features=seq_record.features[:],
annotations=seq_record.annotations.copy(),
letter_annotations=seq_record.letter_annotations.copy(),
)


def make_variant_base(ref_base: str, alt_base: str) -> str:
"""Helper function to generate the correct variant base given the ref and alt alignment bases"""
if ref_base == alt_base:
Expand Down Expand Up @@ -169,19 +187,30 @@ def generate_alignment(
f"got: {alt_molecule_type}, expected: {molecule_type}!"
)

alignments = aligner.align(reference_seq_record, alternate_seq_record)
best_alignment = next(alignments)
best_alignment_score = best_alignment.score
# reverse the sequences and then align the reversed, keeping the first best alignment
reversed_ref_seq_record = reverse_seq_record(reference_seq_record)
reversed_alt_seq_record = reverse_seq_record(alternate_seq_record)
reversed_alignments = aligner.align(reversed_ref_seq_record, reversed_alt_seq_record)
reversed_alignment = reversed_alignments[0]

other_alignments_with_best = [alignment for alignment in alignments if alignment.score == best_alignment_score]
if len(other_alignments_with_best) > 0:
LOGGER.debug(
"Found %s alignments with max score, returning last in the list (3' end rule)",
len(other_alignments_with_best) + 1,
)
best_alignment = other_alignments_with_best[-1]
# undo the reversal, to recover the "last" highest scoring alignment for the forward
# which should correspond to the 3' end most alignment and follow HGSV spec
# note 'infer_coordinates' will soon be deprecated, update to 'parse_printed_alignment' once its live
forward_coordinates = Alignment.infer_coordinates(
[
reversed_alignment[0][::-1],
reversed_alignment[1][::-1],
]
)
forward_alignment = Alignment(
[reference_seq_record, alternate_seq_record],
forward_coordinates,
)

# score is not technically an attribute on the class
setattr(forward_alignment, "score", reversed_alignment.score)

return best_alignment
return forward_alignment


def generate_variant_blocks(alignment: Alignment) -> list[VariantBlock]:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,57 @@ def test_generate_alignment_custom_aligner(self):
alignment = generate_alignment(ref, alt, aligner=custom_aligner)
self.assertTrue(alignment.score, custom_match_score)

def test_generate_alignment_three_prime_end_most_del(self):
ref, alt = self.make_seq_records(
"T" + "A" * 10 + "G",
"T" + "A" * 9 + "G",
)

alignment = generate_alignment(ref, alt)

# ensure we get the ref unchanged
# and the last A deleted, not any others
self.assertEqual(alignment[0], ref.seq)
self.assertEqual(alignment[1], "T" + "A" * 9 + "-G")

def test_generate_alignment_three_prime_end_most_double_del(self):
ref, alt = self.make_seq_records(
"ATTGCCA",
"ATGCA",
)

alignment = generate_alignment(ref, alt)

# ensure we get the ref unchanged
# and the second of each paired based deleted
self.assertEqual(alignment[0], ref.seq)
self.assertEqual(alignment[1], "AT-GC-A")

def test_generate_alignment_three_prime_end_most_ins(self):
ref, alt = self.make_seq_records(
"T" + "A" * 9 + "G",
"T" + "A" * 10 + "G",
)

alignment = generate_alignment(ref, alt)

# ensure we get the alt unchanged
# and the insertion happens after the last A in the ref
self.assertEqual(alignment[0], "T" + "A" * 9 + "-G")
self.assertEqual(alignment[1], alt.seq)

def test_generate_alignment_three_prime_end_most_double_ins(self):
ref, alt = self.make_seq_records(
"ATGCA",
"ATTGCCA",
)
alignment = generate_alignment(ref, alt)

# ensure we get the alt unchanged
# and the insertion happens after the last A in the ref
self.assertEqual(alignment[0], "AT-GC-A")
self.assertEqual(alignment[1], alt.seq)


class GenerateVariantBlocksTestCase(PalamedesBaseCase):
def test_generate_variant_blocks_all_matches(self):
Expand Down

0 comments on commit cb6e300

Please sign in to comment.