Merge branch '302_vaf_usage' into 'develop'

302 vaf usage See merge request tron/addannot!257
TRON-Bioinformatics · Sep 18, 2023 · d9ced09 · d9ced09
2 parents 0f1df3f + c229f7f
commit d9ced09
Show file tree

Hide file tree

Showing 13 changed files with 159 additions and 124 deletions.
diff --git a/docs/resources/column_description.xlsx b/docs/resources/column_description.xlsx
diff --git a/docs/source/03_01_input_data.md b/docs/source/03_01_input_data.md
@@ -36,9 +36,8 @@ where:
 
 **NOTE:** 
 
-- If rnaExpression is not provided and the tumor type is given in the patient data, expression will be estimated by gene expression in TCGA cohort indicated in the `tumorType` in the patient data (see below). Please, not that this does not work for mouse data. Here, expression imputation is currently not supported.
-- If `dnaVariantAlleleFrequency` is given while `rnaVariantAlleleFrequency` is not given, the VAF in RNA will be estimated by the VAF in DNA. 
-This means that feature scores that rely on the VAF in RNA will be calulated with the VAF in DNA.
+- Neofox annotates gene expression in TCGA cohort indicated in the `tumorType` in the patient data (see below) which might be helpful if rnaExpression is unknown. Please, not that this does not work for mouse data. 
+
 
 #### Neoepitope candidates 
 
@@ -67,9 +66,7 @@ where:
 
 - Neoepitopes with a value for `alleleMhcI` are considered MHC-I neoepitopes, likewise neoepitopes with a value for `isoformMhcII` are considered MHC-II neoepitopes. Both fields cannot be provided for the same neoepitope.
 - If none of `alleleMhcI` and `isoformMhcII` are provided then the `patientIdentifier` is required and one neoepitope sharing the same sequence will be annotated for each MHC-I allele and MHC-II isoform according to the patient HLA type.
-- If the tumor type is given in the patient data (see below), gene expression in the matching TCGA cohort is annotated and expression-related are calculated based on imputated gene expression additionally (see [description of output data](03_02_output_data.md)). Please, note that this does not work for mouse data. Here, expression imputation is currently not supported.
-- If `dnaVariantAlleleFrequency` is given while `rnaVariantAlleleFrequency` is not given, the VAF in RNA will be estimated by the VAF in DNA. 
-This means that feature scores that rely on the VAF in RNA will be calulated with the VAF in DNA.
+- Neofox annotates gene expression in TCGA cohort indicated in the `tumorType` in the patient data (see below) which might be helpful if rnaExpression is unknown. Please, not that this does not work for mouse data.
 
 
 ### JSON file format
@@ -106,7 +103,7 @@ where:
 - `identifier`: the patient identifier
 - `mhcIAlleles`: comma separated MHC I alleles of the patient for HLA-A, HLA-B and HLA-C. If homozygous, the allele should be added twice.
 - `mhcIIAlleles`: comma separated  MHC II alleles of the patient for HLA-DRB1, HLA-DQA1, HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice.  
-- `tumorType`: tumour entity in TCGA study abbreviation format (https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations). This field is required for expression imputation and at the moment the following tumor types are supported:
+- `tumorType`: tumour entity in TCGA study abbreviation format (https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations). This field is required for imputation with gene expression and at the moment the following tumor types are supported:
 
 | Study Name                                                         | Abbreviation |
 |--------------------------------------------------------------------|-------------------|

diff --git a/docs/source/03_02_output_data.md b/docs/source/03_02_output_data.md
diff --git a/docs/source/03_03_usage.md b/docs/source/03_03_usage.md
@@ -54,12 +54,12 @@ The optional **config** file with the paths to the dependencies can look like th
 ````commandline
 NEOFOX_REFERENCE_FOLDER=path/to/reference/folder
 NEOFOX_RSCRIPT=`which Rscript`
-NEOFOX_BLASTP=path/to/ncbi-blast-2.10.1+/bin/blastp
-NEOFOX_NETMHCPAN=path/to/netMHCpan-4.1/netMHCpan
-NEOFOX_NETMHC2PAN=path/to/netMHCIIpan-4.0/netMHCIIpan
-NEOFOX_MIXMHCPRED=path/to/MixMHCpred-2.1/MixMHCpred
-NEOFOX_MIXMHC2PRED=path/to/MixMHC2pred-1.2/MixMHC2pred_unix
-NEOFOX_MAKEBLASTDB=path/to/ncbi-blast-2.8.1+/bin/makeblastdb
+NEOFOX_BLASTP=path/to/blast/bin/blastp
+NEOFOX_NETMHCPAN=path/to/netMHCpan/netMHCpan
+NEOFOX_NETMHC2PAN=path/to/netMHCIIpan/netMHCIIpan
+NEOFOX_MIXMHCPRED=path/to/MixMHCpred/MixMHCpred
+NEOFOX_MIXMHC2PRED=path/to/MixMHC2pred/MixMHC2pred_unix
+NEOFOX_MAKEBLASTDB=path/to/ncbi-blast/bin/makeblastdb
 NEOFOX_PRIME=/path/to/PRIME/PRIME
 ````
 
@@ -89,6 +89,9 @@ where:
 
 ## Running from docker
 
+**NOTE: The provided docker recipe is not adapted to Neofox-v1.1.0. Please, use a previous version at the moment if running from docker is required.
+The docker recipe will be updated soon.**
+
 In order to run the command line in a docker image, all of the above applies but
 some additional steps are required.
 

diff --git a/neofox/__init__.py b/neofox/__init__.py
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.#
 
 
-VERSION = "1.1.0b27"
+VERSION = "1.1.0b28"
 
 REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER"
 NEOFOX_BLASTP_ENV = "NEOFOX_BLASTP"

diff --git a/neofox/annotator/neoantigen_annotator.py b/neofox/annotator/neoantigen_annotator.py
@@ -218,11 +218,12 @@ def get_annotated_neoantigen(self, neoantigen: Neoantigen, patient: Patient, wit
         )
 
         # vaxrank
+        # TODO: consider to calculate vaxrank with DNA VAF aswell
         if netmhcpan and netmhcpan.predictions:
             neoantigen.neofox_annotations.annotations.extend(VaxRank().get_annotations(
                 epitope_predictions=netmhcpan.predictions,
-                expression_score=expression_annotation[0].value,
-                imputed_score=expression_annotation[1].value
+                expression_score=[e.value for e in expression_annotation if e.name == "Mutated_rnaExpression_fromRNA"][0],
+                imputed_score=[e.value for e in expression_annotation if e.name == "Mutated_imputedGeneExpression_fromRNA"][0]
             ))
 
         # hex

diff --git a/neofox/neofox.py b/neofox/neofox.py
@@ -105,16 +105,9 @@ def __init__(
 
         self._validate_input_data()
 
-        # retrieve from the data, if RNA-seq was available
-        # add this information to patient model
-        expression_per_patient = {self.patients[patient].identifier: [] for patient in self.patients}
-        for neoantigen in self.neoantigens:
-            expression_per_patient[neoantigen.patient_identifier].append(neoantigen.rna_expression)
 
-        # only performs the expression imputation for humans
+        # annotate TCGA gene expression
         if self.reference_folder.organism == ORGANISM_HOMO_SAPIENS:
-            # impute expresssion from TCGA, ONLY if isRNAavailable = False for given patient,
-            # otherwise original values is reported
             # NOTE: this must happen after validation to avoid uncaptured errors due to missing patients
             # NOTE: add gene expression to neoantigen candidate model
             self.neoantigens = self._conditional_expression_imputation()
@@ -128,15 +121,15 @@ def _conditional_expression_imputation(self) -> List[Neoantigen]:
         neoantigens_transformed = []
 
         for neoantigen in self.neoantigens:
-            expression_value = neoantigen.rna_expression
+
             patient = self.patients[neoantigen.patient_identifier]
             neoantigen_transformed = neoantigen
+
             gene_expression = expression_annotator.get_gene_expression_annotation(
                 gene_name=neoantigen.gene, tcga_cohort=patient.tumor_type
             )
 
-            neoantigen_transformed.rna_expression = expression_value
-            neoantigen.imputed_gene_expression = gene_expression
+            neoantigen_transformed.imputed_gene_expression = gene_expression
 
             neoantigens_transformed.append(neoantigen_transformed)
         return neoantigens_transformed

diff --git a/neofox/published_features/expression.py b/neofox/published_features/expression.py
@@ -18,26 +18,25 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.#
 from typing import List
-from logzero import logger
-from neofox.model.neoantigen import Annotation, Neoantigen, Patient
+from neofox.model.neoantigen import Annotation, Neoantigen
 from neofox.model.factories import AnnotationFactory
 
 
 class Expression:
 
     @staticmethod
     def _get_expression_annotation(
-        transcript_gene_expression: float, vaf_rna: float
+        transcript_gene_expression: float, vaf: float
     ) -> float:
         """
-        This function calculates the product of VAF in RNA and transcript expression
+        This function calculates the product of VAF and transcript expression
         to reflect the expression of the mutated transcript
         """
         expression_mut = None
         try:
             expression_mut = (
-                transcript_gene_expression * vaf_rna
-                if vaf_rna is not None and vaf_rna >= 0.0
+                transcript_gene_expression * vaf
+                if vaf is not None and vaf >= 0.0
                 else None
             )
         except (TypeError, ValueError):
@@ -46,15 +45,17 @@ def _get_expression_annotation(
 
     def get_annotations(self, neoantigen: Neoantigen) -> List[Annotation]:
 
-        vaf = neoantigen.rna_variant_allele_frequency
-        if vaf is None or vaf == -1:
-            vaf = neoantigen.dna_variant_allele_frequency
-
         return [
             AnnotationFactory.build_annotation(
-                name="Mutated_rnaExpression", value=self._get_expression_annotation(
-                    transcript_gene_expression=neoantigen.rna_expression, vaf_rna=vaf)),
+                name="Mutated_rnaExpression_fromRNA", value=self._get_expression_annotation(
+                    transcript_gene_expression=neoantigen.rna_expression, vaf=neoantigen.rna_variant_allele_frequency)),
+            AnnotationFactory.build_annotation(
+                name="Mutated_rnaExpression_fromDNA", value=self._get_expression_annotation(
+                    transcript_gene_expression=neoantigen.rna_expression, vaf=neoantigen.dna_variant_allele_frequency)),
+            AnnotationFactory.build_annotation(
+                name="Mutated_imputedGeneExpression_fromRNA", value=self._get_expression_annotation(
+                    transcript_gene_expression=neoantigen.imputed_gene_expression, vaf=neoantigen.rna_variant_allele_frequency)),
             AnnotationFactory.build_annotation(
-                name="Mutated_imputedGeneExpression", value=self._get_expression_annotation(
-                    transcript_gene_expression=neoantigen.imputed_gene_expression, vaf_rna=vaf))
+                name="Mutated_imputedGeneExpression_fromDNA", value=self._get_expression_annotation(
+                    transcript_gene_expression=neoantigen.imputed_gene_expression, vaf=neoantigen.dna_variant_allele_frequency))
         ]
diff --git a/neofox/published_features/priority_score.py b/neofox/published_features/priority_score.py
@@ -21,9 +21,8 @@
 # import modules
 import math
 from typing import List
-
 from neofox.helpers.epitope_helper import EpitopeHelper
-from neofox.model.neoantigen import Annotation, PredictedEpitope, Neoantigen, Patient
+from neofox.model.neoantigen import Annotation, PredictedEpitope, Neoantigen
 from neofox.model.factories import AnnotationFactory
 from neofox.MHC_predictors.netmhcpan.combine_netmhcpan_pred_multiple_binders import (
     BestAndMultipleBinder,
@@ -43,8 +42,7 @@ def calc_logistic_function(self, mhc_score):
 
     def calc_priority_score(
         self,
-        vaf_dna,
-        vaf_rna,
+        vaf,
         transcript_gene_expr,
         no_mismatch,
         score_mut,
@@ -53,15 +51,12 @@ def calc_priority_score(
     ):
         """
         This function calculates the Priority Score using parameters for mhc I.
+        Bjerregard, 2017, Cancer Immunol Immunother
+        https://doi.org/10.1007/s00262-017-2001-3
         """
         priority_score = None
-        vaf = None
         try:
-            if vaf_dna is not None and vaf_dna != -1:
-                vaf = vaf_dna
-            elif vaf_rna is not None and vaf_rna != -1:
-                vaf = vaf_rna
-            if vaf:
+            if vaf is not None and vaf != -1:
                 l_mut = self.calc_logistic_function(score_mut)
                 l_wt = self.calc_logistic_function(score_wt)
                 priority_score = self.mupexi(
@@ -94,29 +89,42 @@ def get_annotations(
         returns number of mismatches between best MHCI / MHC II epitopes (rank) and their corresponding WTs
         """
         num_mismatches_mhc1 = None
-        priority_score = None
-        priority_score_imputed = None
+        priority_score_dna = None
+        priority_score_rna = None
+        priority_score_imputed_dna = None
+        priority_score_imputed_rna = None
         if netmhcpan.best_epitope_by_rank.wild_type_peptide and netmhcpan.best_epitope_by_rank.mutated_peptide:
             num_mismatches_mhc1 = EpitopeHelper.number_of_mismatches(
                 epitope_wild_type=netmhcpan.best_epitope_by_rank.wild_type_peptide,
                 epitope_mutation=netmhcpan.best_epitope_by_rank.mutated_peptide,
             )
-            vaf_rna = neoantigen.dna_variant_allele_frequency
-            if vaf_rna is None:
-                vaf_rna = neoantigen.rna_variant_allele_frequency
 
-            priority_score = self.calc_priority_score(
-                        vaf_dna=neoantigen.dna_variant_allele_frequency,
-                        vaf_rna=vaf_rna,
+            priority_score_dna = self.calc_priority_score(
+                        vaf=neoantigen.dna_variant_allele_frequency,
+                        transcript_gene_expr=neoantigen.rna_expression,
+                        no_mismatch=num_mismatches_mhc1,
+                        score_mut=netmhcpan.best_epitope_by_rank.rank_mutated,
+                        score_wt=netmhcpan.best_epitope_by_rank.rank_wild_type,
+                        mut_not_in_prot=mut_not_in_prot,
+                    )
+            priority_score_rna = self.calc_priority_score(
+                        vaf=neoantigen.rna_variant_allele_frequency,
                         transcript_gene_expr=neoantigen.rna_expression,
                         no_mismatch=num_mismatches_mhc1,
                         score_mut=netmhcpan.best_epitope_by_rank.rank_mutated,
                         score_wt=netmhcpan.best_epitope_by_rank.rank_wild_type,
                         mut_not_in_prot=mut_not_in_prot,
                     )
-            priority_score_imputed = self.calc_priority_score(
-                        vaf_dna=neoantigen.dna_variant_allele_frequency,
-                        vaf_rna=vaf_rna,
+            priority_score_imputed_dna = self.calc_priority_score(
+                        vaf=neoantigen.dna_variant_allele_frequency,
+                        transcript_gene_expr=neoantigen.imputed_gene_expression,
+                        no_mismatch=num_mismatches_mhc1,
+                        score_mut=netmhcpan.best_epitope_by_rank.rank_mutated,
+                        score_wt=netmhcpan.best_epitope_by_rank.rank_wild_type,
+                        mut_not_in_prot=mut_not_in_prot,
+                    )
+            priority_score_imputed_rna = self.calc_priority_score(
+                        vaf=neoantigen.rna_variant_allele_frequency,
                         transcript_gene_expr=neoantigen.imputed_gene_expression,
                         no_mismatch=num_mismatches_mhc1,
                         score_mut=netmhcpan.best_epitope_by_rank.rank_mutated,
@@ -129,13 +137,23 @@ def get_annotations(
             ),
             # priority score with rank score
             AnnotationFactory.build_annotation(
-                value=priority_score,
-                name="Priority_score",
+                value=priority_score_dna,
+                name="Priority_score_fromDNA",
+            ),
+            # imputed priority score with rank score
+            AnnotationFactory.build_annotation(
+                value=priority_score_imputed_rna,
+                name="Priority_score_imputed_fromRNA"
+            ),
+            # priority score with rank score f
+            AnnotationFactory.build_annotation(
+                value=priority_score_rna,
+                name="Priority_score_fromRNA",
             ),
             # imputed priority score with rank score
             AnnotationFactory.build_annotation(
-                value=priority_score_imputed,
-                name="Priority_score_imputed"
+                value=priority_score_imputed_dna,
+                name="Priority_score_imputed_fromDNA"
             )
         ]
         return annotations
@@ -145,26 +163,47 @@ def get_annotations_epitope_mhci(self, epitope: PredictedEpitope, vaf_tumor, tra
         return [
             AnnotationFactory.build_annotation(
                 value=self.calc_priority_score(
-                    vaf_dna=vaf_tumor,
-                    vaf_rna=vaf_rna,
+                    vaf=vaf_tumor,
                     transcript_gene_expr=transcript_exp,
                     no_mismatch=int(EpitopeHelper.get_annotation_by_name(
                         epitope.neofox_annotations.annotations, name='number_of_mismatches')),
                     score_mut=epitope.rank_mutated,
                     score_wt=epitope.rank_wild_type,
                     mut_not_in_prot=bool(EpitopeHelper.get_annotation_by_name(
                         epitope.neofox_annotations.annotations, name='mutation_not_found_in_proteome'))),
-                name='Priority_score'),
+                name='Priority_score_fromDNA'),
             AnnotationFactory.build_annotation(
                 value=self.calc_priority_score(
-                    vaf_dna=vaf_tumor,
-                    vaf_rna=vaf_rna,
+                    vaf=vaf_tumor,
                     transcript_gene_expr=gene_exp,
                     no_mismatch=int(EpitopeHelper.get_annotation_by_name(
                         epitope.neofox_annotations.annotations, name='number_of_mismatches')),
                     score_mut=epitope.rank_mutated,
                     score_wt=epitope.rank_wild_type,
                     mut_not_in_prot=bool(EpitopeHelper.get_annotation_by_name(
                         epitope.neofox_annotations.annotations, name='mutation_not_found_in_proteome'))),
-                name='Priority_score_imputed'),
+                name='Priority_score_imputed_fromDNA'),
+            AnnotationFactory.build_annotation(
+                value=self.calc_priority_score(
+                    vaf=vaf_rna,
+                    transcript_gene_expr=transcript_exp,
+                    no_mismatch=int(EpitopeHelper.get_annotation_by_name(
+                        epitope.neofox_annotations.annotations, name='number_of_mismatches')),
+                    score_mut=epitope.rank_mutated,
+                    score_wt=epitope.rank_wild_type,
+                    mut_not_in_prot=bool(EpitopeHelper.get_annotation_by_name(
+                        epitope.neofox_annotations.annotations, name='mutation_not_found_in_proteome'))),
+                name='Priority_score_fromRNA'),
+            AnnotationFactory.build_annotation(
+                value=self.calc_priority_score(
+                    vaf=vaf_rna,
+                    transcript_gene_expr=gene_exp,
+                    no_mismatch=int(EpitopeHelper.get_annotation_by_name(
+                        epitope.neofox_annotations.annotations, name='number_of_mismatches')),
+                    score_mut=epitope.rank_mutated,
+                    score_wt=epitope.rank_wild_type,
+                    mut_not_in_prot=bool(EpitopeHelper.get_annotation_by_name(
+                        epitope.neofox_annotations.annotations, name='mutation_not_found_in_proteome'))),
+                name='Priority_score_imputed_fromRNA'),
+
             ]
diff --git a/neofox/published_features/vaxrank/vaxrank.py b/neofox/published_features/vaxrank/vaxrank.py
@@ -43,6 +43,8 @@ def logistic_epitope_score(
            "The relationship between class I binding affinity
             and immunogenicity of potential cytotoxic T cell epitopes.
         adapted from: https://github.com/openvax/vaxrank/blob/master/vaxrank/epitope_prediction.py
+        Rubinsteyn, 2017, Front Immunol
+        https://doi.org/10.3389/fimmu.2017.01807
         """
         if ic50 >= ic50_cutoff:
             return 0.0