Skip to content

Commit

Permalink
Merge pull request #18 from mammothbio-os/expose_hgvs_variants_from_a…
Browse files Browse the repository at this point in the history
…lignment

Expose generate_alignment + generate_hgvs_variants_from_alignment functions
  • Loading branch information
austin-mbs authored Jul 1, 2024
2 parents c13ec0c + 78ace50 commit b5b33bc
Show file tree
Hide file tree
Showing 9 changed files with 311 additions and 218 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.8
current_version = 0.0.9
commit = True
tag = False

Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ alt 0 TPFKISIH- 8

## Usage - Python

Palamedes currently includes a single public function that provides the alignment to HGVS functionality. All other functions should
be treated as internal and private. No assurances are offered for their consistency and functionality from version to version. The function accepts either raw sequences or biopython `SeqRecord` objects.
Palamedes currently includes public functions in `__init__.py` that provide the alignment to HGVS functionality. All other functions should be treated as internal and private. No assurances are offered for their consistency and functionality from version to version.

```python
>>> from Bio.Seq import Seq
Expand Down
2 changes: 2 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@
API
==========
.. autofunction:: palamedes.generate_hgvs_variants
.. autofunction:: palamedes.generate_hgvs_variants_from_alignment
.. autofunction:: palamedes.generate_alignment
184 changes: 166 additions & 18 deletions palamedes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,171 @@
from Bio.Align import PairwiseAligner
from Bio.Align import PairwiseAligner, Alignment
from Bio.SeqRecord import SeqRecord
from hgvs.sequencevariant import SequenceVariant

from palamedes.config import MOLECULE_TYPE_PROTEIN, ALT_SEQUENCE_ID, REF_SEQUENCE_ID
from palamedes.align import generate_seq_record, generate_alignment, generate_variant_blocks
from palamedes.align import generate_seq_record, generate_variant_blocks, reverse_seq_record
from palamedes.hgvs.utils import categorize_variant_block
from palamedes.hgvs.builders import BUILDER_CONFIG
from palamedes.config import (
DEFAULT_EXTEND_GAP_SCORE,
DEFAULT_MATCH_SCORE,
DEFAULT_MISMATCH_SCORE,
DEFAULT_OPEN_GAP_SCORE,
GLOBAL_ALIGN_MODE,
MOLECULE_TYPE_PROTEIN,
ALT_SEQUENCE_ID,
REF_SEQUENCE_ID,
)

__version__ = "0.0.9"


def generate_hgvs_variants_from_alignment(
alignment: Alignment, use_non_standard_substitution_rules: bool = False, molecule_type: str = MOLECULE_TYPE_PROTEIN
) -> list[SequenceVariant]:
"""
Given a pairwise alignment object and a molecule type, generate a list of HGVS SequenceVariants.
- Alignment: Generated via BioPython.PairwiseAligner - (See `generate_alignment` for more information)
- molecule_type: Currently only molecule type 'protein' is supported.
- An optional flag: `use_non_standard_substitution_rules` is a boolean flag which will enable logic that treats multiple consecutive mismatches as separate subsitutions, vs merging together into a delins. This is against HGVS
spec but has utility for some use cases.
.. code-block:: python
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from palamedes import generate_hgvs_variants_from_alignment, generate_alignment
>>> ref = SeqRecord(Seq("PFKISIHL"), id="Jelleine-I", annotations={"molecule_type": "protein"})
>>> alt = SeqRecord(Seq("TPFKISIH"), id="Jelleine-IV", annotations={"molecule_type": "protein"})
>>> alignment = generate_alignment(ref, alt)
>>> generate_hgvs_variants_from_alignment(alignment)
[
SequenceVariant(ac=Jelleine-I, type=p, posedit=Pro1extThr-1, gene=None),
SequenceVariant(ac=Jelleine-I, type=p, posedit=Leu8del, gene=None)
]
"""
if molecule_type not in BUILDER_CONFIG:
raise NotImplementedError(f"No HGVS builder is defined for molecule_type: {molecule_type}!")

variant_blocks = generate_variant_blocks(
alignment,
split_consecutive_mismatches=use_non_standard_substitution_rules,
)
builder = BUILDER_CONFIG[molecule_type](alignment)
return [
builder.build(
variant_block,
categorize_variant_block(variant_block, alignment),
)
for variant_block in variant_blocks
]


def generate_alignment(
reference_seq_record: SeqRecord,
alternate_seq_record: SeqRecord,
molecule_type: str = MOLECULE_TYPE_PROTEIN,
aligner: PairwiseAligner | None = None,
) -> Alignment:
"""
Using biopython's PairwiseAligner, generate an alignment object representing the best alignment
between the 2 biopython SeqRecords. Note that the molecule_type argument will be used to specify a molecule_type
annotation on the SeqRecord.
By default the function creates an aligner object using the defaults, but the caller may provide their
own pre-configured aligner. This aligner must be set to 'global' mode.
Note that it is possible for multiple alignments to be returned with the same max score. Synthetic testing
has shown that generally the highest scoring alignments are returned in "left to right" order. Take the
following example, which was for testing HGVS duplications. This was the "intended" alignment:
ATC---GGGGGGGG
ATCATCGGGGGGGG
__version__ = "0.0.8"
However, there are multiple ways to get a three base insertion, all with the same score:
ipdb> for aln in aligner.align('ATCGGGGGGGG', 'ATCATCGGGGGGGG'):
print(aln)
target 0 ---ATCGGGGGGGG 11
0 ---||||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 A---TCGGGGGGGG 11
0 |---|||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 AT---CGGGGGGGG 11
0 ||---||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 ATC---GGGGGGGG 11
0 |||---|||||||| 14
query 0 ATCATCGGGGGGGG 14
Due to the HGVS rule of always indicating that the 3' most base is considered the modified one, a best effort
attempt is made to return the most "right aligned" alignment, by returning the last alignment with the highest
score. This way not yield the ideal results in more complicated cases.
.. code-block:: python
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from palamedes import generate_alignment
>>> ref = SeqRecord(Seq("PFKISIHL"), id="Jelleine-I", annotations={"molecule_type": "protein"})
>>> alt = SeqRecord(Seq("TPFKISIH"), id="Jelleine-IV", annotations={"molecule_type": "protein"})
>>> generate_alignment(ref, alt)
<Alignment object (2 rows x 9 columns) at ...>
"""
if aligner is not None:
if aligner.mode != GLOBAL_ALIGN_MODE:
raise ValueError(f"Custom PairwiseAligner must be set to global mode, got: {aligner.mode}")
else:
aligner = PairwiseAligner(
mode=GLOBAL_ALIGN_MODE,
match_score=DEFAULT_MATCH_SCORE,
mismatch_score=DEFAULT_MISMATCH_SCORE,
open_gap_score=DEFAULT_OPEN_GAP_SCORE,
extend_gap_score=DEFAULT_EXTEND_GAP_SCORE,
)

if (ref_molecule_type := reference_seq_record.annotations.get("molecule_type")) != molecule_type:
raise ValueError(
"Cannot generate alignment, reference_seq_record is a SeqRecord an invalid molecule_type annotation "
f"got: {ref_molecule_type}, expected: {molecule_type}!"
)

if (alt_molecule_type := alternate_seq_record.annotations.get("molecule_type")) != molecule_type:
raise ValueError(
"Cannot generate alignment, alternate_seq_record is a SeqRecord an invalid molecule_type annotation "
f"got: {alt_molecule_type}, expected: {molecule_type}!"
)

# reverse the sequences and then align the reversed, keeping the first best alignment
reversed_ref_seq_record = reverse_seq_record(reference_seq_record)
reversed_alt_seq_record = reverse_seq_record(alternate_seq_record)
reversed_alignments = aligner.align(reversed_ref_seq_record, reversed_alt_seq_record)
reversed_alignment = reversed_alignments[0]

# undo the reversal, to recover the "last" highest scoring alignment for the forward
# which should correspond to the 3' end most alignment and follow HGSV spec
# note 'infer_coordinates' will soon be deprecated, update to 'parse_printed_alignment' once its live
forward_coordinates = Alignment.infer_coordinates(
[
reversed_alignment[0][::-1],
reversed_alignment[1][::-1],
]
)
forward_alignment = Alignment(
[reference_seq_record, alternate_seq_record],
forward_coordinates,
)

# score is not technically an attribute on the class
setattr(forward_alignment, "score", reversed_alignment.score)

return forward_alignment


def generate_hgvs_variants(
Expand All @@ -35,7 +192,7 @@ def generate_hgvs_variants(
- extend gap score: `-0.1`
An optional flag: `use_non_standard_substitution_rules` can be passed as `True`, which will enable logic that treats
multiple consecutive mismatches as seperare subsitutions, vs merging together into a delins. This is against HGVS
multiple consecutive mismatches as separate subsitutions, vs merging together into a delins. This is against HGVS
spec but has utility for some use cases.
If using pre-built `SeqRecord` objects, be sure to set the `molecule_type` annotation key to a supported molecule type
Expand Down Expand Up @@ -70,7 +227,9 @@ def generate_hgvs_variants(
"""

if molecule_type not in BUILDER_CONFIG:
raise NotImplementedError(f"No HGVS builder is defined for molecule_type: {molecule_type}!")
raise NotImplementedError(
f"Type {molecule_type} unsupported or unrecognized! Current supported types: {','.join(BUILDER_CONFIG.keys())}"
)

ref_seq_record = (
generate_seq_record(reference_sequence, REF_SEQUENCE_ID, molecule_type=molecule_type)
Expand All @@ -85,15 +244,4 @@ def generate_hgvs_variants(
)

alignment = generate_alignment(ref_seq_record, alt_seq_record, molecule_type=molecule_type, aligner=aligner)
variant_blocks = generate_variant_blocks(
alignment,
split_consecutive_mismatches=use_non_standard_substitution_rules,
)
builder = BUILDER_CONFIG[molecule_type](alignment)
return [
builder.build(
variant_block,
categorize_variant_block(variant_block, alignment),
)
for variant_block in variant_blocks
]
return generate_hgvs_variants_from_alignment(alignment, use_non_standard_substitution_rules, molecule_type)
3 changes: 2 additions & 1 deletion palamedes/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from Bio.Align import PairwiseAligner

from palamedes.align import generate_seq_record, generate_alignment, generate_variant_blocks
from palamedes import generate_alignment, generate_variant_blocks
from palamedes.align import generate_seq_record
from palamedes.config import MOLECULE_TYPE_PROTEIN, ALT_SEQUENCE_ID, REF_SEQUENCE_ID
from palamedes import __version__
from palamedes.hgvs.utils import categorize_variant_block
Expand Down
102 changes: 1 addition & 101 deletions palamedes/align.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import logging
from functools import reduce, partial

from Bio.Align import Alignment, PairwiseAligner
from Bio.Align import Alignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from palamedes.config import (
ALIGNMENT_GAP_CHAR,
DEFAULT_EXTEND_GAP_SCORE,
DEFAULT_MATCH_SCORE,
DEFAULT_MISMATCH_SCORE,
DEFAULT_OPEN_GAP_SCORE,
GLOBAL_ALIGN_MODE,
VARIANT_BASE_DELETION,
VARIANT_BASE_INSERTION,
VARIANT_BASE_MATCH,
Expand Down Expand Up @@ -138,101 +133,6 @@ def merge_reduce(
return blocks


def generate_alignment(
reference_seq_record: SeqRecord,
alternate_seq_record: SeqRecord,
molecule_type: str = MOLECULE_TYPE_PROTEIN,
aligner: PairwiseAligner | None = None,
) -> Alignment:
"""
Using biopython's PairwiseAligner, generate an alignment object representing the best alignment
between the 2 biopython SeqRecords. Note that the molecule_type argument will be used to specify a molecule_type
annotation on the SeqRecord.
By default the function creates an aligner object using the defaults, but the caller may provide their
own pre-configured aligner. This aligner must be set to 'global' mode.
Note that it is possible for multiple alignments to be returned with the same max score. Synthetic testing
has shown that generally the highest scoring alignments are returned in "left to right" order. Take the
following example, which was for testing HGVS duplications. This was the "intended" alignment:
ATC---GGGGGGGG
ATCATCGGGGGGGG
However, there are multiple ways to get a three base insertion, all with the same score:
ipdb> for aln in aligner.align('ATCGGGGGGGG', 'ATCATCGGGGGGGG'):
print(aln)
target 0 ---ATCGGGGGGGG 11
0 ---||||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 A---TCGGGGGGGG 11
0 |---|||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 AT---CGGGGGGGG 11
0 ||---||||||||| 14
query 0 ATCATCGGGGGGGG 14
target 0 ATC---GGGGGGGG 11
0 |||---|||||||| 14
query 0 ATCATCGGGGGGGG 14
Due to the HGVS rule of always indicating that the 3' most base is considered the modified one, a best effort
attempt is made to return the most "right aligned" alignment, by returning the last alignment with the highest
score. This way not yield the ideal results in more complicated cases.
"""
if aligner is not None:
if aligner.mode != GLOBAL_ALIGN_MODE:
raise ValueError(f"Custom PairwiseAligner must be set to global mode, got: {aligner.mode}")
else:
aligner = PairwiseAligner(
mode=GLOBAL_ALIGN_MODE,
match_score=DEFAULT_MATCH_SCORE,
mismatch_score=DEFAULT_MISMATCH_SCORE,
open_gap_score=DEFAULT_OPEN_GAP_SCORE,
extend_gap_score=DEFAULT_EXTEND_GAP_SCORE,
)

if (ref_molecule_type := reference_seq_record.annotations.get("molecule_type")) != molecule_type:
raise ValueError(
"Cannot generate alignment, reference_seq_record is a SeqRecord an invalid molecule_type annotation "
f"got: {ref_molecule_type}, expected: {molecule_type}!"
)

if (alt_molecule_type := alternate_seq_record.annotations.get("molecule_type")) != molecule_type:
raise ValueError(
"Cannot generate alignment, alternate_seq_record is a SeqRecord an invalid molecule_type annotation "
f"got: {alt_molecule_type}, expected: {molecule_type}!"
)

# reverse the sequences and then align the reversed, keeping the first best alignment
reversed_ref_seq_record = reverse_seq_record(reference_seq_record)
reversed_alt_seq_record = reverse_seq_record(alternate_seq_record)
reversed_alignments = aligner.align(reversed_ref_seq_record, reversed_alt_seq_record)
reversed_alignment = reversed_alignments[0]

# undo the reversal, to recover the "last" highest scoring alignment for the forward
# which should correspond to the 3' end most alignment and follow HGSV spec
# note 'infer_coordinates' will soon be deprecated, update to 'parse_printed_alignment' once its live
forward_coordinates = Alignment.infer_coordinates(
[
reversed_alignment[0][::-1],
reversed_alignment[1][::-1],
]
)
forward_alignment = Alignment(
[reference_seq_record, alternate_seq_record],
forward_coordinates,
)

# score is not technically an attribute on the class
setattr(forward_alignment, "score", reversed_alignment.score)

return forward_alignment


def generate_variant_blocks(alignment: Alignment, split_consecutive_mismatches: bool = False) -> list[VariantBlock]:
"""
Given a BioPython.Alignment object, parse the alignment to generate a list of VariantBlock objects.
Expand Down
Loading

0 comments on commit b5b33bc

Please sign in to comment.