forked from bioinform/somaticseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSSeq_merged.vcf2tsv.py
executable file
·1289 lines (1024 loc) · 63.9 KB
/
SSeq_merged.vcf2tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# 1-based index in this program.
# Sample command:
# python3 SSeq_merged.vcf2tsv.py -myvcf BINA.snp.vcf \
# -sniper somaticsniper/variants.vcf \
# -varscan varscan2/variants.snp.vcf \
# -jsm jointsnvmix2/variants.vcf \
# -vardict vardict/variants.snp.vcf.gz \
# -muse muse/variants.vcf \
# -nbam normal.indelrealigned.bam \
# -tbam tumor.indelrealigned.bam \
# -ref human_g1k_v37_decoy.fasta \
# -outfile SSeq2.snp.tsv
### Improvement since the 2015 Genome Biology publication:
# Supports MuSE, LoFreq, Scalpel
# Allow +/- INDEL lengh for insertion and deletion
# Uses pysam to extract information directly from BAM files, e.g., flanking indel, edit distance, discordance, etc.
# Implement optional minimal mapping quality (MQ) and base call quality (BQ) filter, for which pysam considers the reads in BAM files.
# Allow user to count only non-duplicate reads if -dedup option is invoked.
# Allow handling of VCF files with multiple lines (i.e., different variants) at the same chromosome coordinates, as well as ALT columns with multiple ALT calls.
# Deprecated HaplotypeCaller and SAMTools dependencies
# Get useful metrics from MuTect2
# -- 6/1/2016
import sys, argparse, math, gzip, os, pysam
import regex as re
import scipy.stats as stats
import genomic_file_handlers as genome
import pileup_reader as pileup
from read_info_extractor import *
from copy import copy
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
input_sites = parser.add_mutually_exclusive_group()
input_sites.add_argument('-myvcf', '--vcf-format', type=str, help='Input file is VCF formatted.', required=False, default=None)
input_sites.add_argument('-mybed', '--bed-format', type=str, help='Input file is BED formatted.', required=False, default=None)
input_sites.add_argument('-mypos', '--positions-list', type=str, help='A list of positions: tab seperating contig and positions.', required=False, default=None)
parser.add_argument('-nbam', '--normal-bam-file', type=str, help='Normal BAM File', required=True, default=None)
parser.add_argument('-tbam', '--tumor-bam-file', type=str, help='Tumor BAM File', required=True, default=None)
parser.add_argument('-truth', '--ground-truth-vcf', type=str, help='VCF of true hits', required=False, default=None)
parser.add_argument('-dbsnp', '--dbsnp-vcf', type=str, help='dbSNP VCF: do not use if input VCF is annotated', required=False, default=None)
parser.add_argument('-cosmic', '--cosmic-vcf', type=str, help='COSMIC VCF: do not use if input VCF is annotated', required=False, default=None)
parser.add_argument('-mutect', '--mutect-vcf', type=str, help='MuTect VCF', required=False, default=None)
parser.add_argument('-strelka', '--strelka-vcf', type=str, help='Strelka VCF', required=False, default=None)
parser.add_argument('-sniper', '--somaticsniper-vcf', type=str, help='SomaticSniper VCF', required=False, default=None)
parser.add_argument('-varscan', '--varscan-vcf', type=str, help='VarScan2 VCF', required=False, default=None)
parser.add_argument('-jsm', '--jsm-vcf', type=str, help='JointSNVMix2 VCF', required=False, default=None)
parser.add_argument('-vardict', '--vardict-vcf', type=str, help='VarDict VCF', required=False, default=None)
parser.add_argument('-muse', '--muse-vcf', type=str, help='MuSE VCF', required=False, default=None)
parser.add_argument('-lofreq', '--lofreq-vcf', type=str, help='LoFreq VCF', required=False, default=None)
parser.add_argument('-scalpel', '--scalpel-vcf', type=str, help='Scalpel VCF', required=False, default=None)
parser.add_argument('-ref', '--genome-reference', type=str, help='.fasta.fai file to get the contigs', required=True, default=None)
parser.add_argument('-dedup', '--deduplicate', action='store_true', help='Do not consider duplicate reads from BAM files. Default is to count everything', required=False, default=False)
parser.add_argument('-minMQ', '--minimum-mapping-quality',type=float, help='Minimum mapping quality below which is considered poor', required=False, default=1)
parser.add_argument('-minBQ', '--minimum-base-quality', type=float, help='Minimum base quality below which is considered poor', required=False, default=5)
parser.add_argument('-mincaller', '--minimum-num-callers', type=float, help='Minimum number of tools to be considered', required=False, default=0)
parser.add_argument('-scale', '--p-scale', type=str, help='phred, fraction, or none', required=False, default=None)
parser.add_argument('-outfile', '--output-tsv-file', type=str, help='Output TSV Name', required=False, default=os.sys.stdout)
args = parser.parse_args()
# Rename input:
is_vcf = args.vcf_format
is_bed = args.bed_format
is_pos = args.positions_list
nbam_fn = args.normal_bam_file
tbam_fn = args.tumor_bam_file
truth = args.ground_truth_vcf
cosmic = args.cosmic_vcf
dbsnp = args.dbsnp_vcf
mutect = args.mutect_vcf
strelka = args.strelka_vcf
sniper = args.somaticsniper_vcf
varscan = args.varscan_vcf
jsm = args.jsm_vcf
vardict = args.vardict_vcf
muse = args.muse_vcf
lofreq = args.lofreq_vcf
scalpel = args.scalpel_vcf
min_mq = args.minimum_mapping_quality
min_bq = args.minimum_base_quality
ref_fa = args.genome_reference
p_scale = args.p_scale
outfile = args.output_tsv_file
# Convert contig_sequence to chrom_seq dict:
fai_file = ref_fa + '.fai'
chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')
# Determine input format:
if is_vcf:
mysites = is_vcf
elif is_bed:
mysites = is_bed
elif is_pos:
mysites = is_pos
else:
mysites = fai_file
print('No position supplied. Will evaluate the whole genome.', file=sys.stderr)
# Re-scale output or not:
if p_scale == None:
print('NO RE-SCALING', file=sys.stderr)
elif p_scale.lower() == 'phred':
p_scale = 'phred'
elif p_scale.lower() == 'fraction':
p_scale = 'fraction'
else:
p_scale = None
print('NO RE-SCALING', file=sys.stderr)
def rescale(x, original=None, rescale_to=p_scale, max_phred=1001):
if ( rescale_to == None ) or ( original.lower() == rescale_to.lower() ):
y = x if isinstance(x, int) else '%.2f' % x
elif original.lower() == 'fraction' and rescale_to == 'phred':
y = genome.p2phred(x, max_phred=max_phred)
y = '%.2f' % y
elif original.lower() == 'phred' and rescale_to == 'fraction':
y = genome.phred2p(x)
y = '%.2f' % y
return y
# Define NaN and Inf:
nan = float('nan')
inf = float('inf')
pattern_chr_position = genome.pattern_chr_position
# Normal/Tumor index in the Merged VCF file, or any other VCF file that puts NORMAL first.
idxN,idxT = 0,1
# Normal/Tumor index in VarDict VCF, or any other VCF file that puts TUMOR first.
vdT,vdN = 0,1
# Header for the output data, created here so I won't have to indent this line:
out_header = \
'{CHROM}\t\
{POS}\t\
{ID}\t\
{REF}\t\
{ALT}\t\
{if_MuTect}\t\
{if_Strelka}\t\
{if_VarScan2}\t\
{if_JointSNVMix2}\t\
{if_SomaticSniper}\t\
{if_VarDict}\t\
{if_LoFreq}\t\
{if_Scalpel}\t\
{MuSE_Tier}\t\
{Strelka_Score}\t\
{Strelka_QSS}\t\
{Strelka_TQSS}\t\
{VarScan2_Score}\t\
{SNVMix2_Score}\t\
{Sniper_Score}\t\
{VarDict_Score}\t\
{if_dbsnp}\t\
{COMMON}\t\
{if_COSMIC}\t\
{COSMIC_CNT}\t\
{N_DP}\t\
{nBAM_REF_MQ}\t\
{nBAM_ALT_MQ}\t\
{nBAM_Z_Ranksums_MQ}\t\
{nBAM_REF_BQ}\t\
{nBAM_ALT_BQ}\t\
{nBAM_Z_Ranksums_BQ}\t\
{nBAM_REF_NM}\t\
{nBAM_ALT_NM}\t\
{nBAM_NM_Diff}\t\
{nBAM_REF_Concordant}\t\
{nBAM_REF_Discordant}\t\
{nBAM_ALT_Concordant}\t\
{nBAM_ALT_Discordant}\t\
{nBAM_Concordance_FET}\t\
{N_REF_FOR}\t\
{N_REF_REV}\t\
{N_ALT_FOR}\t\
{N_ALT_REV}\t\
{nBAM_StrandBias_FET}\t\
{nBAM_Z_Ranksums_EndPos}\t\
{nBAM_REF_Clipped_Reads}\t\
{nBAM_ALT_Clipped_Reads}\t\
{nBAM_Clipping_FET}\t\
{nBAM_MQ0}\t\
{nBAM_Other_Reads}\t\
{nBAM_Poor_Reads}\t\
{nBAM_REF_InDel_3bp}\t\
{nBAM_REF_InDel_2bp}\t\
{nBAM_REF_InDel_1bp}\t\
{nBAM_ALT_InDel_3bp}\t\
{nBAM_ALT_InDel_2bp}\t\
{nBAM_ALT_InDel_1bp}\t\
{M2_RPA}\t\
{M2_NLOD}\t\
{M2_TLOD}\t\
{M2_STR}\t\
{M2_ECNT}\t\
{M2_HCNT}\t\
{M2_MAXED}\t\
{M2_MINED}\t\
{SOR}\t\
{MSI}\t\
{MSILEN}\t\
{SHIFT3}\t\
{MaxHomopolymer_Length}\t\
{SiteHomopolymer_Length}\t\
{T_DP}\t\
{tBAM_REF_MQ}\t\
{tBAM_ALT_MQ}\t\
{tBAM_Z_Ranksums_MQ}\t\
{tBAM_REF_BQ}\t\
{tBAM_ALT_BQ}\t\
{tBAM_Z_Ranksums_BQ}\t\
{tBAM_REF_NM}\t\
{tBAM_ALT_NM}\t\
{tBAM_NM_Diff}\t\
{tBAM_REF_Concordant}\t\
{tBAM_REF_Discordant}\t\
{tBAM_ALT_Concordant}\t\
{tBAM_ALT_Discordant}\t\
{tBAM_Concordance_FET}\t\
{T_REF_FOR}\t\
{T_REF_REV}\t\
{T_ALT_FOR}\t\
{T_ALT_REV}\t\
{tBAM_StrandBias_FET}\t\
{tBAM_Z_Ranksums_EndPos}\t\
{tBAM_REF_Clipped_Reads}\t\
{tBAM_ALT_Clipped_Reads}\t\
{tBAM_Clipping_FET}\t\
{tBAM_MQ0}\t\
{tBAM_Other_Reads}\t\
{tBAM_Poor_Reads}\t\
{tBAM_REF_InDel_3bp}\t\
{tBAM_REF_InDel_2bp}\t\
{tBAM_REF_InDel_1bp}\t\
{tBAM_ALT_InDel_3bp}\t\
{tBAM_ALT_InDel_2bp}\t\
{tBAM_ALT_InDel_1bp}\t\
{InDel_Length}\t\
{TrueVariant_or_False}'
## Running
with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle:
my_line = my_sites.readline().rstrip()
nbam = pysam.AlignmentFile(nbam_fn)
tbam = pysam.AlignmentFile(tbam_fn)
ref_fa = pysam.FastaFile(ref_fa)
if truth:
truth = genome.open_textfile(truth)
truth_line = truth.readline().rstrip()
while truth_line.startswith('#'):
truth_line = truth.readline().rstrip()
if cosmic:
cosmic = genome.open_textfile(cosmic)
cosmic_line = cosmic.readline().rstrip()
while cosmic_line.startswith('#'):
cosmic_line = cosmic.readline().rstrip()
if dbsnp:
dbsnp = genome.open_textfile(dbsnp)
dbsnp_line = dbsnp.readline().rstrip()
while dbsnp_line.startswith('#'):
dbsnp_line = dbsnp.readline().rstrip()
if mutect:
mutect = genome.open_textfile(mutect)
mutect_line = mutect.readline().rstrip()
while mutect_line.startswith('#'):
mutect_line = mutect.readline().rstrip()
if strelka:
strelka = genome.open_textfile(strelka)
strelka_line = strelka.readline().rstrip()
while strelka_line.startswith('#'):
strelka_line = strelka.readline().rstrip()
if sniper:
sniper = genome.open_textfile(sniper)
sniper_line = sniper.readline().rstrip()
while sniper_line.startswith('#'):
sniper_line = sniper.readline().rstrip()
if varscan:
varscan = genome.open_textfile(varscan)
varscan_line = varscan.readline().rstrip()
while varscan_line.startswith('#'):
varscan_line = varscan.readline().rstrip()
if jsm:
jsm = genome.open_textfile(jsm)
jsm_line = jsm.readline().rstrip()
while jsm_line.startswith('#'):
jsm_line = jsm.readline().rstrip()
if vardict:
vardict = genome.open_textfile(vardict)
vardict_line = vardict.readline().rstrip()
while vardict_line.startswith('#'):
vardict_line = vardict.readline().rstrip()
if muse:
muse = genome.open_textfile(muse)
muse_line = muse.readline().rstrip()
while muse_line.startswith('#'):
muse_line = muse.readline().rstrip()
if lofreq:
lofreq = genome.open_textfile(lofreq)
lofreq_line = lofreq.readline().rstrip()
while lofreq_line.startswith('#'):
lofreq_line = lofreq.readline().rstrip()
if scalpel:
scalpel = genome.open_textfile(scalpel)
scalpel_line = scalpel.readline().rstrip()
while scalpel_line.startswith('#'):
scalpel_line = scalpel.readline().rstrip()
# Get through all the headers:
while my_line.startswith('#') or my_line.startswith('track='):
my_line = my_sites.readline().rstrip()
# First line:
outhandle.write( out_header.replace('{','').replace('}','') + '\n' )
while my_line:
# If VCF, get all the variants with the same coordinate into a list:
if is_vcf:
my_vcf = genome.Vcf_line( my_line )
my_coordinates = [(my_vcf.chromosome, my_vcf.position)]
variants_at_my_coordinate = []
alt_bases = my_vcf.altbase.split(',')
for alt_i in alt_bases:
vcf_i = copy(my_vcf)
vcf_i.altbase = alt_i
variants_at_my_coordinate.append( vcf_i )
# As long as the "coordinate" stays the same, it will keep reading until it's different.
while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
my_line = my_sites.readline().rstrip()
my_vcf = genome.Vcf_line( my_line )
if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
alt_bases = my_vcf.altbase.split(',')
for alt_i in alt_bases:
vcf_i = copy(my_vcf)
vcf_i.altbase = alt_i
variants_at_my_coordinate.append( vcf_i )
elif is_bed:
bed_item = my_line.split('\t')
my_coordinates = genomic_coordinates( bed_item[0], int(bed_item[1])+1, int(bed_item[2]) )
elif is_pos:
pos_item = my_line.split('\t')
my_coordinates = genomic_coordinates( pos_item[0], int(pos_item[1]), int(pos_item[1]) )
elif fai_file:
fai_item = my_line.split('\t')
my_coordinates = genomic_coordinates( fai_item[0], 1, int(fai_item[1]) )
##### ##### ##### ##### ##### #####
for my_coordinate in my_coordinates:
######## If VCF, can get ref base, variant base, as well as other identifying information ########
if is_vcf:
ref_bases = []
alt_bases = []
indel_lengths = []
all_my_identifiers = []
for variant_i in variants_at_my_coordinate:
ref_base = variant_i.refbase
first_alt = variant_i.altbase.split(',')[0]
indel_length = len(first_alt) - len(ref_base)
ref_bases.append( ref_base )
alt_bases.append( first_alt )
indel_lengths.append( indel_length )
# Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan
if variant_i.identifier == '.':
my_identifier_i = set()
else:
my_identifier_i = variant_i.identifier.split(';')
my_identifier_i = set( my_identifier_i )
all_my_identifiers.append( my_identifier_i )
## If not, 1) get ref_base, first_alt from other VCF files.
# 2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
else:
variants_at_my_coordinate = [None] # Just to have something to iterate
ref_base = first_alt = indel_length = None
# Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
if_dbsnp = if_cosmic = if_common = num_cases = nan
# Keep track of NumCallers:
num_callers = 0
#################################### Find the same coordinate in those VCF files ####################################
if args.mutect_vcf: got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(my_coordinate, mutect_line, mutect, chrom_seq)
if args.strelka_vcf: got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(my_coordinate, strelka_line, strelka, chrom_seq)
if args.varscan_vcf: got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(my_coordinate, varscan_line, varscan, chrom_seq)
if args.jsm_vcf: got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(my_coordinate, jsm_line, jsm, chrom_seq)
if args.somaticsniper_vcf: got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(my_coordinate, sniper_line, sniper, chrom_seq)
if args.vardict_vcf: got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(my_coordinate, vardict_line, vardict, chrom_seq)
if args.muse_vcf: got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(my_coordinate, muse_line, muse, chrom_seq)
if args.lofreq_vcf: got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(my_coordinate, lofreq_line, lofreq, chrom_seq)
if args.scalpel_vcf: got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(my_coordinate, scalpel_line, scalpel, chrom_seq)
if args.ground_truth_vcf: got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(my_coordinate, truth_line, truth, chrom_seq)
if args.dbsnp_vcf: got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(my_coordinate, dbsnp_line, dbsnp, chrom_seq)
if args.cosmic_vcf: got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(my_coordinate, cosmic_line, cosmic, chrom_seq)
# Now, use pysam to look into the BAM file(s), variant by variant from the input:
for ith_call, my_call in enumerate( variants_at_my_coordinate ):
if is_vcf:
# The particular line in the input VCF file:
variant_id = ( (my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase )
ref_base = ref_bases[ith_call]
first_alt = alt_bases[ith_call]
indel_length = indel_lengths[ith_call]
my_identifiers = all_my_identifiers[ith_call]
else:
variant_id = ( (my_coordinate[0], my_coordinate[1]), ref_base, first_alt )
#################### Collect MuTect ####################:
if args.mutect_vcf:
if variant_id in mutect_variants:
mutect_variant_i = mutect_variants[variant_id]
mutect_classification = 1 if (mutect_variant_i.get_info_value('SOMATIC') or 'PASS' in mutect_variant_i.filters) else 0
# MuTect2 has some useful information:
rpa = mutect2_RPA(mutect_variant_i)
nlod = mutect2_nlod(mutect_variant_i)
tlod = mutect2_tlod(mutect_variant_i)
tandem = mutect2_STR(mutect_variant_i)
ecnt = mutect2_ECNT(mutect_variant_i)
hcnt = mutect2_HCNT(mutect_variant_i)
maxED = mutect2_maxED(mutect_variant_i)
minED = mutect2_minED(mutect_variant_i)
rpa = sum(rpa)/len(rpa)
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = mutect_variant_i.refbase
if not first_alt: first_alt = mutect_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
# Not called by mutect
mutect_classification = 0
rpa = nlod = tlod = tandem = ecnt = hcnt = maxED = minED = nan
num_callers += mutect_classification
else:
# Assign a bunch of NaN's
mutect_classification = nan
rpa = nlod = tlod = tandem = ecnt = hcnt = maxED = minED = nan
#################### Collect Strelka ####################:
if args.strelka_vcf:
if variant_id in strelka_variants:
strelka_variant_i = strelka_variants[variant_id]
strelka_classification = 1 if 'PASS' in strelka_variant_i.filters else 0
somatic_evs = strelka_variant_i.get_info_value('SomaticEVS')
qss = strelka_variant_i.get_info_value('QSS')
tqss = strelka_variant_i.get_info_value('TQSS')
else:
strelka_classification = 0
somatic_evs = qss = tqss = nan
else:
strelka_classification = nan
somatic_evs = qss = tqss = nan
#################### Collect VarScan ####################:
if args.varscan_vcf:
if variant_id in varscan_variants:
varscan_variant_i = varscan_variants[ variant_id ]
varscan_classification = 1 if varscan_variant_i.get_info_value('SOMATIC') else 0
score_varscan2 = int(varscan_variant_i.get_info_value('SSC'))
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = varscan_variant_i.refbase
if not first_alt: first_alt = varscan_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
varscan_classification = 0
score_varscan2 = nan
num_callers += varscan_classification
else:
varscan_classification = score_varscan2 = nan
#################### Collect JointSNVMix ####################:
if args.jsm_vcf:
if variant_id in jsm_variants:
jsm_variant_i = jsm_variants[ variant_id ]
jointsnvmix2_classification = 1
aaab = float( jsm_variant_i.get_info_value('AAAB') )
aabb = float( jsm_variant_i.get_info_value('AABB') )
jointsnvmix2_p = 1 - aaab - aabb
score_jointsnvmix2 = genome.p2phred(jointsnvmix2_p, max_phred=50)
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = jsm_variant_i.refbase
if not first_alt: first_alt = jsm_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
jointsnvmix2_classification = 0
score_jointsnvmix2 = nan
num_callers += jointsnvmix2_classification
else:
jointsnvmix2_classification = score_jointsnvmix2 = nan
#################### Collect SomaticSniper ####################:
if args.somaticsniper_vcf:
if variant_id in sniper_variants:
sniper_variant_i = sniper_variants[ variant_id ]
sniper_classification = 1 if sniper_variant_i.get_sample_value('SS', idxT) == '2' else 0
if sniper_classification == 1:
score_somaticsniper = sniper_variant_i.get_sample_value('SSC', idxT)
score_somaticsniper = int(score_somaticsniper) if score_somaticsniper else nan
else:
score_somaticsniper = nan
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = sniper_variant_i.refbase
if not first_alt: first_alt = sniper_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
sniper_classification = 0
score_somaticsniper = nan
num_callers += sniper_classification
else:
sniper_classification = score_somaticsniper = nan
#################### Collect VarDict ####################:
if args.vardict_vcf:
if variant_id in vardict_variants:
vardict_variant_i = vardict_variants[ variant_id ]
if (vardict_variant_i.filters == 'PASS') and ('Somatic' in vardict_variant_i.info):
vardict_classification = 1
elif 'Somatic' in vardict_variant_i.info:
vardict_filters = vardict_variant_i.filters.split(';')
disqualifying_filters = ('d7' in vardict_filters or 'd5' in vardict_filters) or \
('DIFF0.2' in vardict_filters) or \
('LongAT' in vardict_filters) or \
('MAF0.05' in vardict_filters) or \
('MSI6' in vardict_filters) or \
('NM4' in vardict_filters or 'NM4.25' in vardict_filters) or \
('pSTD' in vardict_filters) or \
('SN1.5' in vardict_filters) or \
( 'P0.05' in vardict_filters and float(vardict_variant_i.get_info_value('SSF') ) >= 0.15 ) or \
( ('v3' in vardict_filters or 'v4' in vardict_filters) and int(vardict_variant_i.get_sample_value('VD', 0))<3 )
no_bad_filter = not disqualifying_filters
filter_fail_times = len(vardict_filters)
if no_bad_filter and filter_fail_times<=2:
vardict_classification = 0.5
else:
vardict_classification = 0
else:
vardict_classification = 0
# Somatic Score:
score_vardict = vardict_variant_i.get_info_value('SSF')
if score_vardict:
score_vardict = float(score_vardict)
score_vardict = genome.p2phred(score_vardict, max_phred=100)
else:
score_vardict = nan
# MSI, MSILEN, and SHIFT3:
msi = find_MSI(vardict_variant_i)
msilen = find_MSILEN(vardict_variant_i)
shift3 = find_SHIFT3(vardict_variant_i)
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = vardict_variant_i.refbase
if not first_alt: first_alt = vardict_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
vardict_classification = 0
msi = msilen = shift3 = score_vardict = nan
num_callers += vardict_classification
else:
vardict_classification = msi = msilen = shift3 = score_vardict = nan
#################### Collect MuSE ####################:
if args.muse_vcf:
if variant_id in muse_variants:
muse_variant_i = muse_variants[ variant_id ]
if muse_variant_i.filters == 'PASS':
muse_classification = 1
elif muse_variant_i.filters == 'Tier1':
muse_classification = 0.9
elif muse_variant_i.filters == 'Tier2':
muse_classification = 0.8
elif muse_variant_i.filters == 'Tier3':
muse_classification = 0.7
elif muse_variant_i.filters == 'Tier4':
muse_classification = 0.6
elif muse_variant_i.filters == 'Tier5':
muse_classification = 0.5
else:
muse_classification = 0
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = muse_variant_i.refbase
if not first_alt: first_alt = muse_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
muse_classification = 0
num_callers += muse_classification
else:
muse_classification = nan
#################### Collect LoFreq ####################:
if args.lofreq_vcf:
if variant_id in lofreq_variants:
lofreq_variant_i = lofreq_variants[ variant_id ]
lofreq_classification = 1 if lofreq_variant_i.filters == 'PASS' else 0
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = lofreq_variant_i.refbase
if not first_alt: first_alt = lofreq_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
lofreq_classification = 0
num_callers += lofreq_classification
else:
lofreq_classification = nan
#################### Collect Scalpel ####################:
if args.scalpel_vcf:
if variant_id in scalpel_variants:
scalpel_variant_i = scalpel_variants[ variant_id ]
if scalpel_variant_i.get_info_value('SOMATIC'):
if scalpel_variant_i.filters == 'PASS':
scalpel_classification = 1
else:
scalpel_classification = 0.5
else:
scalpel_classification = 0
# If ref_base, first_alt, and indel_length unknown, get it here:
if not ref_base: ref_base = scalpel_variant_i.refbase
if not first_alt: first_alt = scalpel_variant_i.altbase
if indel_length == None: indel_length = len(first_alt) - len(ref_base)
else:
scalpel_classification = 0
num_callers += scalpel_classification
else:
scalpel_classification = nan
# Potentially write the output only if it meets this threshold:
if num_callers >= args.minimum_num_callers:
########## Ground truth file ##########
if args.ground_truth_vcf:
if variant_id in truth_variants.keys():
judgement = 1
my_identifiers.add('TruePositive')
else:
judgement = 0
my_identifiers.add('FalsePositive')
else:
judgement = nan
########## dbSNP ########## Will overwrite dbSNP info from input VCF file
if args.dbsnp_vcf:
if variant_id in dbsnp_variants.keys():
dbsnp_variant_i = dbsnp_variants[variant_id]
if_dbsnp = 1
if_common = 1 if dbsnp_variant_i.get_info_value('COMMON') == '1' else 0
rsID = dbsnp_variant_i.identifier.split(',')
for ID_i in rsID:
my_identifiers.add( ID_i )
else:
if_dbsnp = if_common = 0
########## COSMIC ########## Will overwrite COSMIC info from input VCF file
if args.cosmic_vcf:
if variant_id in cosmic_variants.keys():
cosmic_variant_i = cosmic_variants[variant_id]
# If designated as SNP, make it "non-cosmic" and make CNT=nan.
if cosmic_variant_i.get_info_value('SNP'):
if_cosmic = 0
num_cases = nan
else:
if_cosmic = 1
num_cases = cosmic_variant_i.get_info_value('CNT')
num_cases = num_cases if num_cases else nan
# COSMIC ID still intact:
cosmicID = cosmic_variant_i.identifier.split(',')
for ID_i in cosmicID:
my_identifiers.add( ID_i )
else:
if_cosmic = num_cases = 0
########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
# Normal BAM file:
n_reads = nbam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] )
n_ref_read_mq = []
n_alt_read_mq = []
n_ref_read_bq = []
n_alt_read_bq = []
n_ref_edit_distance = []
n_alt_edit_distance = []
n_ref_concordant_reads = n_alt_concordant_reads = n_ref_discordant_reads = n_alt_discordant_reads = 0
n_ref_for = n_ref_rev = n_alt_for = n_alt_rev = N_dp = 0
n_ref_SC_reads = n_alt_SC_reads = n_ref_notSC_reads = n_alt_notSC_reads = 0
n_MQ0 = 0
n_ref_pos_from_end = []
n_alt_pos_from_end = []
n_ref_flanking_indel = []
n_alt_flanking_indel = []
n_noise_read_count = n_poor_read_count = 0
for read_i in n_reads:
if not read_i.is_unmapped and dedup_test(read_i):
N_dp += 1
code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 )
if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq:
n_poor_read_count += 1
if read_i.mapping_quality == 0:
n_MQ0 += 1
# Reference calls:
if code_i == 1 and base_call_i == ref_base[0]:
n_ref_read_mq.append( read_i.mapping_quality )
n_ref_read_bq.append( read_i.query_qualities[ith_base] )
try:
n_ref_edit_distance.append( read_i.get_tag('NM') )
except KeyError:
pass
# Concordance
if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_ref_concordant_reads += 1
elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_ref_discordant_reads += 1
# Orientation
if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_ref_for += 1
elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_ref_rev += 1
# Soft-clipped reads?
if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip:
n_ref_SC_reads += 1
else:
n_ref_notSC_reads += 1
# Distance from the end of the read:
if ith_base != None:
n_ref_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) )
# Flanking indels:
n_ref_flanking_indel.append( flanking_indel_i )
# Alternate calls:
# SNV, or Deletion, or Insertion where I do not check for matching indel length
elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \
(indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \
(indel_length > 0 and code_i == 3):
n_alt_read_mq.append( read_i.mapping_quality )
n_alt_read_bq.append( read_i.query_qualities[ith_base] )
try:
n_alt_edit_distance.append( read_i.get_tag('NM') )
except KeyError:
pass
# Concordance
if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_alt_concordant_reads += 1
elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_alt_discordant_reads += 1
# Orientation
if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_alt_for += 1
elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq:
n_alt_rev += 1
# Soft-clipped reads?
if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip:
n_alt_SC_reads += 1
else:
n_alt_notSC_reads += 1
# Distance from the end of the read:
if ith_base != None:
n_alt_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) )
# Flanking indels:
n_alt_flanking_indel.append( flanking_indel_i )
# Inconsistent read or 2nd alternate calls:
else:
n_noise_read_count += 1
# Done extracting info from tumor BAM. Now tally them:
n_ref_mq = mean(n_ref_read_mq)
n_alt_mq = mean(n_alt_read_mq)
n_z_ranksums_mq = stats.ranksums(n_alt_read_mq, n_ref_read_mq)[0]
n_ref_bq = mean(n_ref_read_bq)
n_alt_bq = mean(n_alt_read_bq)
n_z_ranksums_bq = stats.ranksums(n_alt_read_bq, n_ref_read_bq)[0]
n_ref_NM = mean(n_ref_edit_distance)
n_alt_NM = mean(n_alt_edit_distance)
n_z_ranksums_NM = stats.ranksums(n_alt_edit_distance, n_ref_edit_distance)[0]
n_NM_Diff = n_alt_NM - n_ref_NM - abs(indel_length)
n_concordance_fet = stats.fisher_exact(( (n_ref_concordant_reads, n_alt_concordant_reads), (n_ref_discordant_reads, n_alt_discordant_reads) ))[1]
n_strandbias_fet = stats.fisher_exact(( (n_ref_for, n_alt_for), (n_ref_rev, n_alt_rev) ))[1]
n_clipping_fet = stats.fisher_exact(( (n_ref_notSC_reads, n_alt_notSC_reads), (n_ref_SC_reads, n_alt_SC_reads) ))[1]
n_z_ranksums_endpos = stats.ranksums(n_alt_pos_from_end, n_ref_pos_from_end)[0]
n_ref_indel_1bp = n_ref_flanking_indel.count(1)
n_ref_indel_2bp = n_ref_flanking_indel.count(2) + n_ref_indel_1bp
n_ref_indel_3bp = n_ref_flanking_indel.count(3) + n_ref_indel_2bp + n_ref_indel_1bp
n_alt_indel_1bp = n_alt_flanking_indel.count(1)
n_alt_indel_2bp = n_alt_flanking_indel.count(2) + n_alt_indel_1bp
n_alt_indel_3bp = n_alt_flanking_indel.count(3) + n_alt_indel_2bp + n_alt_indel_1bp
########################################################################################
# Tumor BAM file:
t_reads = tbam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] )
t_ref_read_mq = []
t_alt_read_mq = []
t_ref_read_bq = []
t_alt_read_bq = []
t_ref_edit_distance = []
t_alt_edit_distance = []
t_ref_concordant_reads = t_alt_concordant_reads = t_ref_discordant_reads = t_alt_discordant_reads = 0
t_ref_for = t_ref_rev = t_alt_for = t_alt_rev = T_dp = 0
t_ref_SC_reads = t_alt_SC_reads = t_ref_notSC_reads = t_alt_notSC_reads = 0
t_MQ0 = 0
t_ref_pos_from_end = []
t_alt_pos_from_end = []
t_ref_flanking_indel = []
t_alt_flanking_indel = []
t_noise_read_count = t_poor_read_count = 0
for read_i in t_reads:
if not read_i.is_unmapped and dedup_test(read_i):
T_dp += 1
code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 )
if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq:
t_poor_read_count += 1
if read_i.mapping_quality == 0:
t_MQ0 += 1
# Reference calls:
if code_i == 1 and base_call_i == ref_base[0]:
t_ref_read_mq.append( read_i.mapping_quality )
t_ref_read_bq.append( read_i.query_qualities[ith_base] )