diff --git a/tests/test_data/ref_transcript_mismatch_reporter/input.protein_coding.vcf b/tests/test_data/ref_transcript_mismatch_reporter/input.protein_coding.vcf new file mode 100644 index 0000000..fadde49 --- /dev/null +++ b/tests/test_data/ref_transcript_mismatch_reporter/input.protein_coding.vcf @@ -0,0 +1,149 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SentieonCommandLine.TNfilter= +##SentieonCommandLine.TNhaplotyper2= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=/xx/hs37d5.fa +##tumor_sample=Tumor-666 +##normal_sample=Normal-666 +##bcftools_filterVersion=1.11+htslib-1.11 +##VEP="v103" time="2024-02-01 13:52:04" cache=/xx/homo_sapiens_refseq/103_GRCh37" ensembl-variation=103.06320c4 ensembl=103.4c8d44a ensembl-io=103.353f93a ensembl-funcgen=103.b53bef4 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" refseq="2019-10-24 23:10:14 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2" +##INFO= +##FrameshiftSequence=Predicted sequence for frameshift mutations +##WildtypeProtein=The normal, non-mutated protein sequence +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Normal-666 Tumor-666 +chr12 48238361 . G GCCTCAATGAGGAGCACTCCAAGCAGTACCGCTGCCTCTCCTTCCAGCC . clustered_events AS_FilterStatus=SITE;AS_SB_TABLE=101,4|1,6;DP=118;ECNT=5;GERMQ=93;MBQ=37,34;MFRL=52,194;MMQ=60,60;MPOS=49;NALOD=1.48;NLOD=8.75;POPAF=6;TLOD=19.3;CSQ=CCTCAATGAGGAGCACTCCAAGCAGTACCGCTGCCTCTCCTTCCAGCC|stop_gained&protein_altering_variant|HIGH|VDR|7421|Transcript|NM_001364085.1|protein_coding|10/10||NM_001364085.1:c.1451_1452insGGCTGGAAGGAGAGGCAGCGGTACTGCTTGGAGTGCTCCTCATTGAGG|NP_001351014.1:p.Asn484delinsLysAlaGlyArgArgGlySerGlyThrAlaTrpSerAlaProHisTer|1611-1612|1451-1452|484|N/KAGRRGSGTAWSAPH*G|aac/aaGGCTGGAAGGAGAGGCAGCGGTACTGCTTGGAGTGCTCCTCATTGAGGc|||-1||EntrezGene|||rseq_mrna_nonmatch&rseq_5p_mismatch||||OK|||||||||||||||MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKGFFRRSMKRKALFTCPFNGDCRITKDNRRHCQACRLKRCVDIGMMKEFILTDEEVQRKREMILKRKEEEALKDSLRPKLSEEQQRIIAILLDAHHKTYDPTYSDFCQFRPPVRVNDGGGSHPSRPNSRHTPSFSGDSSSSCSDHCITSSDMMDSSSFSNLDLSEEDSDDPSVTLELSQLSMLPHLADLVSYSIQKVIGFAKMIPGFRDLTSEDQIVLLKSSAIEVIMLRSNESFTMDDMSWTCGNQDYKYRVSDVTKAGHSLELIEPLIKFQVGLKKLNLHEEEHVLLMAICIVSPDRPGVQDAALIEAIQDRLSNTLQTYIRCRHPPPGSHLLYAKMIQKLADLRSLNEEHSKQYRCLSFQPECSMKLTPLVLEVFGNEISLGQPVAVPGWGCSSRATCQARGWRLLSSPPHPVWGSAPPLPPPLSTQPILSPVQPNPFPAGFSPVP GT:AD:AF:DP:F1R2:F2R1:SB 0/0:29,0:0.0318:29:17,0:12,0:29,0,0,0 0/1:76,7:0.0936:83:57,1:19,0:72,4,1,6 diff --git a/tests/test_data/ref_transcript_mismatch_reporter/output.protein_coding.filtered.vcf b/tests/test_data/ref_transcript_mismatch_reporter/output.protein_coding.filtered.vcf new file mode 100644 index 0000000..d3cb843 --- /dev/null +++ b/tests/test_data/ref_transcript_mismatch_reporter/output.protein_coding.filtered.vcf @@ -0,0 +1,148 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SentieonCommandLine.TNfilter= +##SentieonCommandLine.TNhaplotyper2= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=/xx/hs37d5.fa +##tumor_sample=Tumor-666 +##normal_sample=Normal-666 +##bcftools_filterVersion=1.11+htslib-1.11 +##VEP="v103" time="2024-02-01 13:52:04" cache=/xx/homo_sapiens_refseq/103_GRCh37" ensembl-variation=103.06320c4 ensembl=103.4c8d44a ensembl-io=103.353f93a ensembl-funcgen=103.b53bef4 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" refseq="2019-10-24 23:10:14 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2" +##INFO= +##FrameshiftSequence=Predicted sequence for frameshift mutations +##WildtypeProtein=The normal, non-mutated protein sequence +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Normal-666 Tumor-666 diff --git a/tests/test_ref_transcript_mismatch_reporter.py b/tests/test_ref_transcript_mismatch_reporter.py index 7bf689f..d5c0917 100644 --- a/tests/test_ref_transcript_mismatch_reporter.py +++ b/tests/test_ref_transcript_mismatch_reporter.py @@ -80,3 +80,15 @@ def test_no_protein_position_filter(self): ref_transcript_mismatch_reporter.main(command) self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.no_protein_pos.filtered.vcf'), os.path.join(temp_path.name, 'input.filtered.vcf'))) temp_path.cleanup() + + def test_protein_coding(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.protein_coding.vcf'), os.path.join(temp_path.name, 'input.vcf')) + command = [ + os.path.join(temp_path.name, 'input.vcf'), + "-f", + "hard" + ] + ref_transcript_mismatch_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.protein_coding.filtered.vcf'), os.path.join(temp_path.name, 'input.filtered.vcf'))) + temp_path.cleanup() diff --git a/vatools/ref_transcript_mismatch_reporter.py b/vatools/ref_transcript_mismatch_reporter.py index 3e2d9ac..e505889 100644 --- a/vatools/ref_transcript_mismatch_reporter.py +++ b/vatools/ref_transcript_mismatch_reporter.py @@ -6,7 +6,7 @@ from collections import OrderedDict import logging -def resolve_consequence(consequence_string): +def resolve_consequence(consequence_string, allele): if '&' in consequence_string: consequences = {consequence.lower() for consequence in consequence_string.split('&')} elif '.' in consequence_string: @@ -26,14 +26,17 @@ def resolve_consequence(consequence_string): consequence = 'inframe_ins' elif 'inframe_deletion' in consequences: consequence = 'inframe_del' - #TODO add support for protein_altering_variant - #elif 'protein_altering_variant' in consequences: - # if len(ref) > len(alt) and (len(ref) - len(alt)) % 3 == 0: - # consequence = 'inframe_del' - # elif len(alt) > len(ref) and (len(alt) - len(ref)) % 3 == 0: - # consequence = 'inframe_ins' - # else: - # consequence = None + elif 'protein_altering_variant' in consequences: + if '-' in allele: + allele = allele.replace('-', '') + if len(allele) % 3 == 0: + consequence = 'inframe_del' + else: + consequence = None + elif len(allele) % 3 == 0: + consequence = 'inframe_ins' + else: + consequence = None else: consequence = None return consequence @@ -129,6 +132,8 @@ def main(args_input = sys.argv[1:]): for transcript in entry.INFO['CSQ']: transcript_count += 1 for key, value in zip(csq_format, transcript.split('|')): + if key == 'Allele': + allele = value if key == 'WildtypeProtein': full_wildtype_sequence = value if key == 'Amino_acids': @@ -144,7 +149,7 @@ def main(args_input = sys.argv[1:]): if protein_position == '-': protein_position = value.split('/')[1] if key == 'Consequence': - variant_type = resolve_consequence(value) + variant_type = resolve_consequence(value, allele) if key == 'Feature': transcript_id = value