Merge pull request #134 from eastgenomics/Cosmic_handling

Cosmic handling
eastgenomics · May 2, 2023 · f596e57 · f596e57
2 parents a303df8 + 2cdcdeb
commit f596e57
Show file tree

Hide file tree

Showing 6 changed files with 133 additions and 16 deletions.
diff --git a/dxapp.json b/dxapp.json
@@ -3,11 +3,11 @@
   "title": "eggd_generate_variant_workbook",
   "summary": "Create Excel workbook from VEP annotated vcf",
   "dxapi": "1.0.0",
-  "version": "2.3.0",
+  "version": "2.4.0",
   "properties": {
-    "githubRelease": "v2.3.0"
+    "githubRelease": "v2.4.0"
   },
-  "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet;",
+  "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet; * v2.4.0 Added handling for duplicate annotations in VEP fields (e.g. COSMIC, CGC);",
   "authorizedUsers": [
     "org-emee_1"
   ],

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,13 @@
 colour==0.1.5
-python-Levenshtein==0.12.2
 openpyxl==3.0.9
-pandas==1.3.5
+pandas==1.3.5
+et-xmlfile==1.1.0
+filetype==1.1.0
+jarowinkler==1.2.1
+Levenshtein==0.20.2
+numpy==1.23.2
+python-dateutil==2.8.2
+python-Levenshtein==0.12.2
+pytz==2022.2.1
+rapidfuzz==2.5.0
+six==1.16.0
diff --git a/resources/home/dnanexus/generate_workbook/tests/test_columns.py b/resources/home/dnanexus/generate_workbook/tests/test_columns.py
@@ -25,6 +25,7 @@ def read_test_vcf(vcf_file):
     vcf_handler = vcf(argparse.Namespace(
         add_name=False, analysis='', clinical_indication='', exclude=None,
         filter=None, include=None, keep=False, merge=False,
+        add_comment_column=False,
         out_dir='',
         output='',
         panel='', print_columns=False, print_header=False, reads='',
@@ -57,6 +58,7 @@ def read_column_from_vcf(vcf, column) -> list:
 
     return output.stdout.decode().splitlines()
 
+
 class TestMainColumns():
     """
     Tests for ensuring the CHROM, POS, REF, ALT, ID, QUAL and FILTER
@@ -250,6 +252,69 @@ def test_format_sample_values_are_correct(self):
         )
 
 
+class TestVEPHandling():
+    """
+    Tests for splitColumns.unique_vep() that handles
+    duplicates in INFO/CSQ VEP columns.
+    """
+    # test vcf standard sample
+    test_vcf = os.path.join(TEST_DATA_DIR, "HD753-unittest_annotated.split.vcf")
+    # run dataframe through splitColumns.split() (info, format_fields, unique_vep)
+    vcf_df = read_test_vcf(vcf_file=test_vcf)
+    vcf_df = splitColumns().split(vcf_df)
+
+
+    def test_parsed_correct_COSMICcMuts_values(self):
+        """
+        Test that COSMICcMuts values read into the dataframe match the
+        de-duplicated values parsed directly from the VCF
+        """
+        # read COSMICcMuts values from vcf
+        output = subprocess.run(
+            (
+                f"grep -v '^#' {self.test_vcf} | grep -oh "
+                f"'COSMICcMuts=[A-Z0-9&\.]*;' | sort | uniq"
+            ), shell=True, capture_output=True
+        )
+        # clean up values
+        stdout = output.stdout.decode().splitlines()
+        stdout = sorted(list([
+            x.replace(';', '').replace('COSMICcMuts=', '') for x in stdout
+        ]))
+        stdout = [' & '.join(set(x.split("&"))) for x in stdout]
+        # get COSMICcMuts values from dataframe
+        df_values = sorted(list(self.vcf_df['CSQ_COSMICcMuts'].unique().tolist()))
+        assert all([str(x) == str(y) for x, y in zip(stdout, df_values)]), (
+            "COSMICcMuts values in VCF do not match those in dataframe"
+        )
+
+    def test_parsed_correct_COSMICncMuts_values(self):
+            """
+            Test that COSMICncMuts values read into the dataframe match the
+            de-duplicated values parsed directly from the VCF
+            """
+            # read COSMICncMuts values from vcf
+            output = subprocess.run(
+                (
+                    f"grep -v '^#' {self.test_vcf} | grep -oh "
+                    f"'COSMICncMuts=[A-Z0-9&\.]*;' | sort | uniq"
+                ), shell=True, capture_output=True
+            )
+
+            # clean up values
+            stdout = output.stdout.decode().splitlines()
+            stdout = sorted(list([
+                x.replace(';', '').replace('COSMICncMuts=', '') for x in stdout
+            ]))
+            stdout = [' & '.join(set(x.split("&"))) for x in stdout]
+            # get COSMICncMuts values from dataframe
+            df_values = sorted(list(self.vcf_df['CSQ_COSMICncMuts'].unique().tolist()))
+
+            assert all([str(x) == str(y) for x, y in zip(stdout, df_values)]), (
+                "COSMICncMuts values in VCF do not match those in dataframe"
+            )
+
+
 if __name__ == "__main__":
     columns = TestMainColumns()
     columns.test_filter()

diff --git a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py
@@ -112,6 +112,7 @@ class instance of vcf from utils
             add_name=True, analysis='',
             filter=None, keep=False, merge=False,
             reorder=[], exclude=None, include=None,
+            add_comment_column=False,
             out_dir='', output='',
             panel='', print_columns=False, print_header=False, reads='',
             rename=None, sample='', sheets=['variants'], summary=None,
@@ -430,12 +431,11 @@ def test_decipher_links_build_38():
         ])
 
         test_vcf = vcf(argparse.Namespace(decipher=True))
-        test_vcf.vcf = [df]
+        test_vcf.vcfs = [df]
         test_vcf.refs = ['38']  # Set reference = build 38
 
         # Call function to add hyperlinks
         vcf.add_hyperlinks(test_vcf)
-
         # Define expected string output
         valid_string = (
             '=HYPERLINK("https://www.deciphergenomics.org/sequence-variant/1-6'
@@ -503,6 +503,35 @@ def test_gnomad_build_38():
             "gnomAD AF link output incorrect for build 38 input"
         )
 
+    @staticmethod
+    def test_cosmic_build_37():
+        '''
+        Test that the COSMIC links are generated correctly for build 37
+        '''
+        # Initialise test dataframe with build 37 genome positions
+        df = pd.DataFrame([
+            {'CHROM': 1, 'POS': 2488153, 'REF': 'A',
+             'ALT': 'G', 'COSMICcMuts': 'COSV63186428'},
+        ])
+
+        test_vcf = vcf(argparse.Namespace())
+        test_vcf.vcfs = [df]
+        test_vcf.refs = ['37']  # Set reference = build 37
+
+        # Call function to add hyperlinks
+        vcf.add_hyperlinks(test_vcf)
+
+        valid_string = (
+            '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?'
+            'genome=37&q=COSV63186428", "COSV63186428")'
+        )
+
+        # Assert the output is as expected
+        assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, (
+            "COSMICcMuts link output incorrect for build 37 input"
+        )
+
+
 if __name__ == "__main__":
     header = TestHeader()
     header.test_column_names()

diff --git a/resources/home/dnanexus/generate_workbook/utils/columns.py b/resources/home/dnanexus/generate_workbook/utils/columns.py
@@ -28,14 +28,15 @@ def split(self, vcf_df) -> Union[pd.DataFrame, int]:
         """
         vcf_df = self.info(vcf_df)
         vcf_df = self.format_fields(vcf_df)
-        vcf_df = self.unique_cosmic(vcf_df)
+        vcf_df = self.unique_vep(vcf_df)
 
         return vcf_df
 
 
-    def unique_cosmic(self, vcf_df) -> pd.DataFrame:
+    def unique_vep(self, vcf_df) -> pd.DataFrame:
         """
         Handle known bug in VEP annotation where it duplicates COSMIC IDs
+        This de-duplicates '&' separated values in all CSQ prefixed columns
 
         Parameters
         ----------
@@ -47,10 +48,16 @@ def unique_cosmic(self, vcf_df) -> pd.DataFrame:
         vcf_df : pd.DataFrame
             dataframe of variants
         """
-        if 'COSMIC' in vcf_df.columns:
-            vcf_df['COSMIC'] = vcf_df['COSMIC'].apply(
-                lambda x: ' & '.join(set(x.split('&')))
+
+        # Find all columns that start with 'csq'
+        csq_columns = [col for col in vcf_df.columns if col.lower().startswith('csq')]
+
+        # Split each value on '&', drop duplicates, sort and re-join with '&'
+        for col in csq_columns:
+            vcf_df[col] = vcf_df[col].apply(
+                lambda x: '&'.join(sorted(set(x.split('&')))) if isinstance(x, str) else x
             )
+
         return vcf_df
 
 
@@ -174,20 +181,17 @@ def info(self, vcf_df) -> pd.DataFrame:
         info_keys = [x for x in info_keys if x]  # can end up with empty string
 
         info_values = []
-
         # info_pairs -> list of list of pairs, one list per variant
         for variant_pairs in info_pairs:
             # for every variants values, split them out to dict to add to df
             pair_values = {}
-
             for pair in variant_pairs:
                 if '=' in pair:
                     # key value pair
                     key, value = pair.split('=')
                 else:
                     # Flag value present (e.g STR)
                     key, value = pair, True
-
                 pair_values[key] = value
 
             info_values.append(pair_values)
@@ -199,4 +203,5 @@ def info(self, vcf_df) -> pd.DataFrame:
         # drop INFO and CSQ as we fully split them out
         vcf_df.drop(['INFO'], axis=1, inplace=True)
 
+
         return vcf_df
diff --git a/resources/home/dnanexus/generate_workbook/utils/vcf.py b/resources/home/dnanexus/generate_workbook/utils/vcf.py
@@ -45,7 +45,7 @@ def __init__(self, args) -> None:
         self.urls = {
             "csq_existing_variation": "https://www.ncbi.nlm.nih.gov/snp/",
             "csq_clinvar": "https://www.ncbi.nlm.nih.gov/clinvar/variation/",
-            "csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?q=",
+            "csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?genome=BUILD&q=",  # genome=37&q={ID}
             "csq_hgmd": "https://my.qiagendigitalinsights.com/bbp/view/hgmd/pro/mut.php?acc=",
             "csq_mastermind_mmid3": "https://mastermind.genomenon.com/detail?mutation=",
             "gnomad_base_url": "https://gnomad.broadinstitute.org/variant/CHROM-POS-REF-ALT",
@@ -480,6 +480,8 @@ def add_hyperlinks(self) -> None:
                 if 'gnomad' in col.lower():
                     # gnomAD columns won't be exact match on name to dict
                     url = self.urls.get('gnomad')
+                elif 'cosmic' in col.lower():
+                    url = self.urls.get('csq_cosmic')
                 else:
                     url = self.urls.get(col.lower(), None)
 
@@ -568,6 +570,13 @@ def make_hyperlink(self, column, url, value, build):
             # sheet so there is no need to display full length hyperlink
             value[column] = url.split('/')[-1]
 
+        elif 'cosmic' in column.lower():
+            # COSMIC URL stub differs by genome build, so substitute the
+            # BUILD placeholder with the build before the ID is appended
+            url = url.replace('BUILD', str(build))
+            # Build COSMIC URL by appending the ID from the column to the stub
+            url = f'{url}{value[column]}'
+
         else:
             # other URLs with value appended to end
             url = f'{url}{value[column]}'