Skip to content

Commit

Permalink
feat!: add sequence to SequenceLocation (#570)
Browse files Browse the repository at this point in the history
close #552

* Remove `vrs_ref_allele_seq` from `GnomadVcfToProteinService` model.
Instead, this is now stored in `variation.locaation.sequence`
  • Loading branch information
korikuzma authored Jul 18, 2024
1 parent 9055f7e commit f4b385f
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 68 deletions.
39 changes: 16 additions & 23 deletions src/variation/gnomad_vcf_to_protein_variation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from cool_seq_tool.handlers import SeqRepoAccess
from cool_seq_tool.mappers import ManeTranscript
from cool_seq_tool.schemas import ResidueMode, Strand
from cool_seq_tool.schemas import Strand
from ga4gh.core import domain_models, ga4gh_identify
from ga4gh.vrs import models, normalize
from gene.query import QueryHandler as GeneQueryHandler
Expand All @@ -19,6 +19,7 @@
from variation.schemas.validation_response_schema import ValidationResult
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_vrs_loc_seq
from variation.validate import Validate


Expand Down Expand Up @@ -372,11 +373,17 @@ def _dna_to_aa(dna_seq: str, strand: Strand) -> str:
return aa

def _get_protein_representation(
self, ga4gh_seq_id: str, aa_start_pos: int, aa_end_pos: int, aa_alt: str
self,
ga4gh_seq_id: str,
p_ac: str,
aa_start_pos: int,
aa_end_pos: int,
aa_alt: str,
) -> models.Allele:
"""Create VRS Allele for protein representation
:param ga4gh_seq_id: GA4GH identifier for protein accession
:param p_ac: RefSeq or Ensembl protein accession
:param aa_start_pos: Protein start position (inter-residue coordinates)
:param aa_end_pos: Protein end position (inter-residue coordinates)
:param aa_alt: Protein alternate sequence
Expand All @@ -402,6 +409,12 @@ def _get_protein_representation(
msg = f"VRS-Python unable to normalize allele: {e}"
raise GnomadVcfToProteinError(msg) from e

loc_seq = get_vrs_loc_seq(
self.seqrepo_access, p_ac, variation.location.start, variation.location.end
)
if loc_seq:
variation.location.sequence = models.SequenceString(root=loc_seq)

# Add VRS digests for VRS Allele and VRS Sequence Location
variation.id = ga4gh_identify(variation)
variation.location.id = ga4gh_identify(variation.location)
Expand All @@ -420,25 +433,6 @@ def _get_gene_context(self, gene: str) -> domain_models.Gene | None:
else None
)

def _get_vrs_ref_allele_seq(
self, location: models.SequenceLocation, p_ac: str
) -> str | None:
"""Return reference sequence given a VRS location.
:param location: VRS Location object
:param identifier: Identifier for allele
:return: VRS ref seq allele
"""
start = location.start
end = location.end
if isinstance(start, int) and isinstance(end, int) and (start != end):
ref, _ = self.seqrepo_access.get_reference_sequence(
p_ac, start, end, residue_mode=ResidueMode.INTER_RESIDUE
)
else:
ref = None
return ref

async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinService:
"""Get protein consequence for gnomAD-VCF like expression
Assumes input query uses GRCh38 representation
Expand Down Expand Up @@ -576,7 +570,7 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi
# Create the protein VRS Allele
try:
variation = self._get_protein_representation(
p_ga4gh_seq_id, aa_start_pos, aa_end_pos, aa_alt
p_ga4gh_seq_id, p_ac, aa_start_pos, aa_end_pos, aa_alt
)
except GnomadVcfToProteinError as e:
warnings.append(str(e))
Expand All @@ -591,7 +585,6 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi
return GnomadVcfToProteinService(
variation_query=vcf_query,
variation=variation,
vrs_ref_allele_seq=self._get_vrs_ref_allele_seq(variation.location, p_ac),
gene_context=gene_context,
warnings=warnings,
service_meta_=ServiceMeta(
Expand Down
59 changes: 40 additions & 19 deletions src/variation/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from variation import __version__
from variation.classify import Classify
from variation.schemas.app_schemas import Endpoint
from variation.schemas.classification_response_schema import ClassificationType
from variation.schemas.normalize_response_schema import (
HGVSDupDelModeOption,
NormalizeService,
Expand All @@ -21,10 +22,11 @@
TranslationResult,
VrsSeqLocAcStatus,
)
from variation.schemas.validation_response_schema import ValidationSummary
from variation.to_vrs import ToVRS
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import update_warnings_for_no_resp
from variation.utils import get_vrs_loc_seq, update_warnings_for_no_resp
from variation.validate import Validate


Expand Down Expand Up @@ -138,6 +140,40 @@ def get_hgvs_dup_del_mode(

return hgvs_dup_del_mode, warning

def _get_location_seq(
self,
validation_summary: ValidationSummary,
variation: dict,
priority_translation_result: TranslationResult,
) -> str | None:
"""Get reference sequence for a Sequence Location
Does not support:
- Ambiguous genomic deletions or duplications
- Amplifications
- Variations that are not Allele or Copy Number
:param validation_summary: Validation summary for classification containing
valid and invalid results
:param variation: VRS Variation object
:param priority_translation_result: Prioritized translation result
:return: Reference sequence for a sequence location if found
"""
valid_result = validation_summary.valid_results[0]
classification_type = valid_result.classification.classification_type
if classification_type not in {
ClassificationType.GENOMIC_DELETION_AMBIGUOUS,
ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS,
ClassificationType.AMPLIFICATION,
} and variation["type"] in {"Allele", "CopyNumberChange", "CopyNumberCount"}:
return get_vrs_loc_seq(
self.seqrepo_access,
priority_translation_result.vrs_seq_loc_ac,
variation["location"]["start"],
variation["location"]["end"],
)
return None

async def normalize(
self,
q: str,
Expand Down Expand Up @@ -231,26 +267,11 @@ async def normalize(
try:
variation = translation_result.vrs_variation
except AttributeError as e:
# vrs_ref_allele_seq = None
warnings.append(str(e))
else:
pass
# valid_result = validation_summary.valid_results[0]
# classification_type = valid_result.classification.classification_type
# if classification_type not in {
# ClassificationType.GENOMIC_DELETION_AMBIGUOUS,
# ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS,
# ClassificationType.AMPLIFICATION,
# }:
# variation_type = variation["type"]
# if variation_type in {
# "Allele", "CopyNumberChange", "CopyNumberCount"
# }:
# vrs_ref_allele_seq = self.get_ref_allele_seq(
# variation["location"], translation_result.vrs_seq_loc_ac
# )
# else:
# vrs_ref_allele_seq = None
variation["location"]["sequence"] = self._get_location_seq(
validation_summary, variation, translation_result
)

if not variation:
update_warnings_for_no_resp(label, warnings)
Expand Down
2 changes: 0 additions & 2 deletions src/variation/schemas/gnomad_vcf_to_protein_schema.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module for gnomad vcf to protein response schema"""

from ga4gh.core import domain_models
from pydantic import StrictStr

from variation.schemas.normalize_response_schema import NormalizeService

Expand All @@ -10,4 +9,3 @@ class GnomadVcfToProteinService(NormalizeService):
"""Define response for gnomad vcf to protein service"""

gene_context: domain_models.Gene | None = None
vrs_ref_allele_seq: StrictStr | None = None
14 changes: 12 additions & 2 deletions src/variation/to_copy_number_variation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from variation.to_vrs import ToVRS
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_priority_sequence_location
from variation.utils import get_priority_sequence_location, get_vrs_loc_seq
from variation.validate import Validate

VALID_CLASSIFICATION_TYPES = [
Expand Down Expand Up @@ -184,7 +184,14 @@ async def _hgvs_to_cnv_resp(
do_liftover=do_liftover,
)
if translations:
variation = translations[0].vrs_variation
translation_result = translations[0]
variation = translation_result.vrs_variation
variation["location"]["sequence"] = get_vrs_loc_seq(
self.seqrepo_access,
translation_result.vrs_seq_loc_ac,
variation["location"]["start"],
variation["location"]["end"],
)

if variation:
if copy_number_type == HGVSDupDelModeOption.COPY_NUMBER_COUNT:
Expand Down Expand Up @@ -493,6 +500,9 @@ def _get_parsed_seq_loc(
sequenceReference=models.SequenceReference(refgetAccession=sequence),
start=start_vrs,
end=end_vrs,
sequence=get_vrs_loc_seq(
self.seqrepo_access, accession, start_vrs, end_vrs
),
)
seq_loc.id = ga4gh_identify(seq_loc)

Expand Down
37 changes: 27 additions & 10 deletions src/variation/to_vrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from variation.schemas.validation_response_schema import ValidationResult
from variation.tokenize import Tokenize
from variation.translate import Translate
from variation.utils import get_vrs_loc_seq
from variation.validate import Validate
from variation.vrs_representation import VRSRepresentation

Expand Down Expand Up @@ -88,6 +89,31 @@ async def get_translations(

return translations, warnings

def _get_vrs_variations(self, translations: list[TranslationResult]) -> list[dict]:
"""Get translated VRS Variations.
This method will also add ``sequence`` to the variation's location
:param translations: List of translation results
:return: List of unique VRS Variations
"""
variations = []
_added_variation_ids = set()

# Ensure only unique VRS variations are in the list of variations returned
for tr in translations:
if tr.vrs_variation["id"] not in _added_variation_ids:
vrs_variation = tr.vrs_variation
vrs_variation["location"]["sequence"] = get_vrs_loc_seq(
self.seqrepo_access,
tr.vrs_seq_loc_ac,
vrs_variation["location"]["start"],
vrs_variation["location"]["end"],
)
variations.append(vrs_variation)
_added_variation_ids.add(vrs_variation["id"])
return variations

async def to_vrs(self, q: str) -> ToVRSService:
"""Return a VRS-like representation of all validated variations for a query.
Expand Down Expand Up @@ -134,15 +160,6 @@ async def to_vrs(self, q: str) -> ToVRSService:
translations = []
warnings = validation_summary.warnings

if not translations:
variations = []
else:
variations = []
# Ensure only unique VRS variations are in the list of variations returned
for tr in translations:
if tr.vrs_variation not in variations:
variations.append(tr.vrs_variation)

params["warnings"] = warnings
params["variations"] = variations
params["variations"] = self._get_vrs_variations(translations)
return ToVRSService(**params)
28 changes: 28 additions & 0 deletions src/variation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from bioutils.sequences import aa1_to_aa3 as _aa1_to_aa3
from bioutils.sequences import aa3_to_aa1 as _aa3_to_aa1
from cool_seq_tool.handlers import SeqRepoAccess
from cool_seq_tool.schemas import ResidueMode
from ga4gh.core import domain_models
from ga4gh.vrs import models

from variation.schemas.app_schemas import AmbiguousRegexType
from variation.schemas.classification_response_schema import AmbiguousType
Expand Down Expand Up @@ -209,3 +211,29 @@ def get_refget_accession(

refget_accession = ids[0].split("ga4gh:")[-1]
return refget_accession


def get_vrs_loc_seq(
seqrepo_access: SeqRepoAccess,
identifier: str,
start: int | models.Range | None,
end: int | models.Range | None,
) -> str | None:
"""Get the literal sequence encoded by the ``identifier`` at the start and end
coordinates.
Does not support locations that do not have both start/end as ints
:param seqrepo_access: Access to SeqRepo client
:param identifier: Accession for VRS Location (not ga4gh)
:param start: Start position (inter-residue)
:param end: End position (inter-residue)
:return: Get the literal sequence at the given location
"""
if isinstance(start, int) and isinstance(end, int) and (start != end):
ref, _ = seqrepo_access.get_reference_sequence(
identifier, start, end, residue_mode=ResidueMode.INTER_RESIDUE
)
else:
ref = None
return ref or None # get_reference_sequence can return empty str
Loading

0 comments on commit f4b385f

Please sign in to comment.