Skip to content

Commit

Permalink
Merge pull request #134 from eastgenomics/Cosmic_handling
Browse files Browse the repository at this point in the history
Cosmic handling
  • Loading branch information
jethror1 authored May 2, 2023
2 parents a303df8 + 2cdcdeb commit f596e57
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 16 deletions.
6 changes: 3 additions & 3 deletions dxapp.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
"title": "eggd_generate_variant_workbook",
"summary": "Create Excel workbook from VEP annotated vcf",
"dxapi": "1.0.0",
"version": "2.3.0",
"version": "2.4.0",
"properties": {
"githubRelease": "v2.3.0"
"githubRelease": "v2.4.0"
},
"whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet;",
"whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet; * v2.4.0 Added handling for duplicate annotation in VEP fields (i.e. cosmic, CGC, etc..);",
"authorizedUsers": [
"org-emee_1"
],
Expand Down
13 changes: 11 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
colour==0.1.5
python-Levenshtein==0.12.2
openpyxl==3.0.9
pandas==1.3.5
pandas==1.3.5
et-xmlfile==1.1.0
filetype==1.1.0
jarowinkler==1.2.1
Levenshtein==0.20.2
numpy==1.23.2
python-dateutil==2.8.2
python-Levenshtein==0.12.2
pytz==2022.2.1
rapidfuzz==2.5.0
six==1.16.0
65 changes: 65 additions & 0 deletions resources/home/dnanexus/generate_workbook/tests/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def read_test_vcf(vcf_file):
vcf_handler = vcf(argparse.Namespace(
add_name=False, analysis='', clinical_indication='', exclude=None,
filter=None, include=None, keep=False, merge=False,
add_comment_column=False,
out_dir='',
output='',
panel='', print_columns=False, print_header=False, reads='',
Expand Down Expand Up @@ -57,6 +58,7 @@ def read_column_from_vcf(vcf, column) -> list:

return output.stdout.decode().splitlines()


class TestMainColumns():
"""
Tests for ensuring the CHROM, POS, REF, ALT, ID, QUAL and FILTER
Expand Down Expand Up @@ -250,6 +252,69 @@ def test_format_sample_values_are_correct(self):
)


class TestVEPHandling():
    """
    Tests for splitColumns.unique_vep() that handles
    duplicates in INFO/CSQ VEP columns.
    """
    # test vcf standard sample
    test_vcf = os.path.join(
        TEST_DATA_DIR, "HD753-unittest_annotated.split.vcf"
    )
    # run dataframe through splitColumns.split() to split out the INFO
    # column and de-duplicate the values of the VEP (CSQ) columns
    vcf_df = read_test_vcf(vcf_file=test_vcf)
    vcf_df = splitColumns().split(vcf_df)

    def _expected_from_vcf(self, field) -> list:
        """
        Read the values of the given field directly from the test VCF and
        apply the same de-duplication that splitColumns.unique_vep()
        applies (unique values, sorted, re-joined with '&').

        Parameters
        ----------
        field : str
            name of the field as written in the VCF (e.g. COSMICcMuts)

        Returns
        -------
        list
            sorted list of the unique de-duplicated values
        """
        output = subprocess.run(
            (
                f"grep -v '^#' {self.test_vcf} | grep -oh "
                rf"'{field}=[A-Z0-9&\.]*;' | sort | uniq"
            ), shell=True, capture_output=True
        )

        # strip the trailing ';' and the leading 'FIELD=' from each match
        raw_values = [
            x.replace(';', '').replace(f'{field}=', '')
            for x in output.stdout.decode().splitlines()
        ]

        # sorting the split values keeps the expected string deterministic,
        # and the outer set collapses values that only become equal once
        # their duplicates are removed (e.g. 'A&A' and 'A')
        return sorted(
            {'&'.join(sorted(set(x.split('&')))) for x in raw_values}
        )

    def _values_from_dataframe(self, column) -> list:
        """
        Get the sorted unique values of the given dataframe column as
        strings, ignoring missing values (variants without the annotation
        are not matched by the grep of the VCF either).

        Parameters
        ----------
        column : str
            name of the column in the dataframe (e.g. CSQ_COSMICcMuts)

        Returns
        -------
        list
            sorted list of unique values present in the column
        """
        return sorted({str(x) for x in self.vcf_df[column].dropna().unique()})

    def test_parsed_correct_COSMICcMuts_values(self):
        """
        Test values read into dataframe for COSMICcMuts match the values
        read from the VCF
        """
        expected = self._expected_from_vcf('COSMICcMuts')
        observed = self._values_from_dataframe('CSQ_COSMICcMuts')

        # compare the full lists, zip() would silently ignore any values
        # that are only present in the longer of the two
        assert expected == observed, (
            "COSMICcMuts values in VCF do not match those in dataframe"
        )

    def test_parsed_correct_COSMICncMuts_values(self):
        """
        Test values read into dataframe for COSMICncMuts match the values
        read from the VCF
        """
        expected = self._expected_from_vcf('COSMICncMuts')
        observed = self._values_from_dataframe('CSQ_COSMICncMuts')

        # compare the full lists, zip() would silently ignore any values
        # that are only present in the longer of the two
        assert expected == observed, (
            "COSMICncMuts values in VCF do not match those in dataframe"
        )


if __name__ == "__main__":
columns = TestMainColumns()
columns.test_filter()
Expand Down
33 changes: 31 additions & 2 deletions resources/home/dnanexus/generate_workbook/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ class instance of vcf from utils
add_name=True, analysis='',
filter=None, keep=False, merge=False,
reorder=[], exclude=None, include=None,
add_comment_column=False,
out_dir='', output='',
panel='', print_columns=False, print_header=False, reads='',
rename=None, sample='', sheets=['variants'], summary=None,
Expand Down Expand Up @@ -430,12 +431,11 @@ def test_decipher_links_build_38():
])

test_vcf = vcf(argparse.Namespace(decipher=True))
test_vcf.vcf = [df]
test_vcf.vcfs = [df]
test_vcf.refs = ['38'] # Set reference = build 38

# Call function to add hyperlinks
vcf.add_hyperlinks(test_vcf)

# Define expected string output
valid_string = (
'=HYPERLINK("https://www.deciphergenomics.org/sequence-variant/1-6'
Expand Down Expand Up @@ -503,6 +503,35 @@ def test_gnomad_build_38():
"gnomAD AF link output incorrect for build 38 input"
)

@staticmethod
def test_cosmic_build_37():
    '''
    Check that a COSMIC ID is turned into the expected hyperlink when
    the reference genome is build 37
    '''
    # link expected for the COSMIC ID of the test variant on build 37
    expected_link = (
        '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?'
        'genome=37&q=COSV63186428", "COSV63186428")'
    )

    # single test variant at a build 37 genome position with a COSMIC ID
    variants = pd.DataFrame([{
        'CHROM': 1, 'POS': 2488153, 'REF': 'A',
        'ALT': 'G', 'COSMICcMuts': 'COSV63186428'
    }])

    handler = vcf(argparse.Namespace())
    handler.vcfs = [variants]
    handler.refs = ['37']  # reference build for the dataframe above

    # add the hyperlinks to the dataframe(s) of the handler
    vcf.add_hyperlinks(handler)

    cosmic_link = handler.vcfs[0]["COSMICcMuts"][0]
    assert cosmic_link == expected_link, (
        "COSMICcMuts link output incorrect for build 37 input"
    )


if __name__ == "__main__":
header = TestHeader()
header.test_column_names()
Expand Down
21 changes: 13 additions & 8 deletions resources/home/dnanexus/generate_workbook/utils/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ def split(self, vcf_df) -> Union[pd.DataFrame, int]:
"""
vcf_df = self.info(vcf_df)
vcf_df = self.format_fields(vcf_df)
vcf_df = self.unique_cosmic(vcf_df)
vcf_df = self.unique_vep(vcf_df)

return vcf_df


def unique_cosmic(self, vcf_df) -> pd.DataFrame:
def unique_vep(self, vcf_df) -> pd.DataFrame:
"""
Handle known bug in VEP annotation where it duplicates COSMIC IDs
This creates a sorted, '&' separated string of the unique values for every CSQ column
Parameters
----------
Expand All @@ -47,10 +48,16 @@ def unique_cosmic(self, vcf_df) -> pd.DataFrame:
vcf_df : pd.DataFrame
dataframe of variants
"""
if 'COSMIC' in vcf_df.columns:
vcf_df['COSMIC'] = vcf_df['COSMIC'].apply(
lambda x: ' & '.join(set(x.split('&')))

# Find all columns that start with 'csq'
csq_columns = [col for col in vcf_df.columns if col.lower().startswith('csq')]

# Split each 'csq' value on '&', remove duplicates and re-join the sorted values with '&'
for col in csq_columns:
vcf_df[col] = vcf_df[col].apply(
lambda x: '&'.join(sorted(set(x.split('&')))) if isinstance(x, str) else x
)

return vcf_df


Expand Down Expand Up @@ -174,20 +181,17 @@ def info(self, vcf_df) -> pd.DataFrame:
info_keys = [x for x in info_keys if x] # can end up with empty string

info_values = []

# info_pairs -> list of list of pairs, one list per variant
for variant_pairs in info_pairs:
# for every variants values, split them out to dict to add to df
pair_values = {}

for pair in variant_pairs:
if '=' in pair:
# key value pair
key, value = pair.split('=')
else:
# Flag value present (e.g STR)
key, value = pair, True

pair_values[key] = value

info_values.append(pair_values)
Expand All @@ -199,4 +203,5 @@ def info(self, vcf_df) -> pd.DataFrame:
# drop INFO and CSQ as we fully split them out
vcf_df.drop(['INFO'], axis=1, inplace=True)


return vcf_df
11 changes: 10 additions & 1 deletion resources/home/dnanexus/generate_workbook/utils/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, args) -> None:
self.urls = {
"csq_existing_variation": "https://www.ncbi.nlm.nih.gov/snp/",
"csq_clinvar": "https://www.ncbi.nlm.nih.gov/clinvar/variation/",
"csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?q=",
"csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?genome=BUILD&q=", # genome=37&q={ID}
"csq_hgmd": "https://my.qiagendigitalinsights.com/bbp/view/hgmd/pro/mut.php?acc=",
"csq_mastermind_mmid3": "https://mastermind.genomenon.com/detail?mutation=",
"gnomad_base_url": "https://gnomad.broadinstitute.org/variant/CHROM-POS-REF-ALT",
Expand Down Expand Up @@ -480,6 +480,8 @@ def add_hyperlinks(self) -> None:
if 'gnomad' in col.lower():
# gnomAD columns won't be exact match on name to dict
url = self.urls.get('gnomad')
elif 'cosmic' in col.lower():
url = self.urls.get('csq_cosmic')
else:
url = self.urls.get(col.lower(), None)

Expand Down Expand Up @@ -568,6 +570,13 @@ def make_hyperlink(self, column, url, value, build):
# sheet so there is no need to display full length hyperlink
value[column] = url.split('/')[-1]

elif 'cosmic' in column.lower():
# COSMIC requires the COSMIC ID to be appended to the URL, and the URL
# stub differs based on genome build
url = url.replace('BUILD', str(build))
# Build COSMIC URL by appending the ID that is in the column to the URL stub
url = f'{url}{value[column]}'

else:
# other URLs with value appended to end
url = f'{url}{value[column]}'
Expand Down

0 comments on commit f596e57

Please sign in to comment.