From 55aa96cecfe563fd75f7f2281479b3bd0ce8f241 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Fri, 13 Oct 2023 10:45:37 -0500 Subject: [PATCH] Downgrade duplicate variant error to a warning and skip duplicates. --- .../input.duplicate_variant.vcf.gz | Bin 0 -> 3997 bytes .../output.duplicate_variant.tsv | 2 ++ tests/test_vep_annotation_reporter.py | 14 ++++++++++++++ vatools/vep_annotation_reporter.py | 8 ++++++-- 4 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz create mode 100644 tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv diff --git a/tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz b/tests/test_data/vep_annotation_reporter/input.duplicate_variant.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..8cd8c833d860e01e6bf64c69f155c566bd8e5a3d GIT binary patch literal 3997 zcmV;O4`T2iiwFn{U@2t)18Ht>b#yLdb#QEHV_|elRYvu7lZ z=5@b*{p$lH_YXfjMx*dCNymO(?-L@9b2a@^4YMGPCV7;^ z_3zzql7?9r=NaY4&(tKzvglzHQvN6p)7a0Wk9d0&;zK!=52K74C4rAG{{BBOxZX59 z%`L|_lTYm@f3!Y69Hf4Oqf!1mrN476c`A>rUfNINhcK;IF1_bTSZ~I8I0)1A0c6ZP z3aEZK$%h#QvjZtYDh(e&;69dfCSOKHad|39`(a9rqAU;ln>`cuDSQzyH#_;PV3ui9@ zo#As91^$Sd4E+qwQbT_-n$A0lr$!8Pm!Nxm(L+fA#)MhVjWb* zOyvoc#6ShA7>4PVoK5ww$k|la1^fgp7KLyrcW;3AA4D|jO#Z!G^o#WNcCSedgMyv2&%Hr;4+&{-K z*M5X|K&Jp>P+lB_3*J~2k5L~VyQJVX>4OX)4%P&SK6fV66+{1XlnfY}n!I0L3-&;P zi~5u%@n8#m=*PK${1*gt!t`pkTtPCPfc?ZdmE|z!ERO;hdlzO^Q(E<`xuFr(%dav|nC0E3wgMw`r)Q8bwVR{u1A{bl|iXZ)8s8Tx6T3NTAh2Nr8Xn%9K7 zTPjP&(*qgAwP#=kdB8&mo@^f5HkhF~GkQK-f5VU7V=sww43K?hE7S^-(^8kzSyv|fdf05f$>G&7UWrguh=d#YVF^{GQo7l?|C`p`QyaL7-D-?lI zl<5tme&Yo5{ghByI7q_*xWmi`qJ6zcS%sgM+hfxgT3_zn0(DG$8TgY#j z+J3&-@;{e3(o&MuL8aRHRjCzmr00{bEK7JaFo|g_3W{tOH<-=>eEuP07 zPX4dV;&C1k_LzwN`iX4j7CUC)T8Ogz(gUw$&MSlbou0Y$p5qz+KZp?RO^3FFbqAE_ zgY7GeJm`8h8|vYiQn7npAJmoadwc1}|3vlu+%K2nMP<1#9Vf~Dj##>;v4!K_c46qw zeVH$M9*;vv+XL#Z4{pH4CW{%dpIMwrVmDPRLbEnFUmAVWEe9{*;D2?BC`) z#gEN9uj^=q_RF1JG;Pm?w}#O{LHRl<0@ap#b43=4ik3T0n&jZ_mIvf<6em4Di251G z(g$e@?K4ho<3aBuHocNcZ6Z&$6|F#?E8A+Jo3eolr>V4N()Q5Z&n}#A7`D=Ds)dKl zVsWU0dUv{xG_O!hC^u2lfOC+AHaHfE)m^=5ukPBqwz_-gHCA_#j7p2ynnc}! zcoPLUR`nO>C~RlhtRBV4drZOGWy9zZV(Qb(Y+$q<3RoF0C=XzNIhoSo_x=;u@u<82 z2H4P6-2=xOmzCcoyvJ81Gn`$_2AEsns_oBaxrhA+!<4D@FsggAiz*+3$I9S$>V27~ zd{_t7Qhj@jE$fhFhND@;|3glH!~t8vUR*HYD>+&FaF;{=4S z;{>SNj}s8SjuQZSKTbgSI!@p#_TvPEuj7RFT_^M>)=6o&ch-e-hc24xg{$cokgZ*` zf0oUQ>5bkCEXl?6p2|g|Xa987P!xufg!%h-eMCSc2StsxeCqD+^vr4G|2c5aR4KqQ!Y(-Km-@ds^W+0#|4@GHCq6@QlFFg@;{qsWW`a zD^IaiiKB&zSj_xoFGz5CQV7=!Q*4vn1QmhhjsM8_l(Tw@znN6zbimYvMwAsbhA)7y zfZf8qZWMtLTnBPcC>C91X{Pp0u_S?ES^gcCv^S_QEbPL#@6wDFC5{(r;ydLISqxp} z=UX*qwpHb6QDA1R=+4#J`Gu3iSl4zD) zY;lV%ezqmSG>a=x6jNph<;xAjNRkLgiURVb9_aNWz<}MhNEh-<64d;%Z}5L9r%?U55M4FbPvNJ)Pbv8Ubk(auA#f8p_>h4cybGMJjLrG zM>g%2r7LYyZh4B+X`A;Q6?u;17@pg4WlL^$TW!>CSf1ADxUS+f8k(yh6{&^^Y}*a9 z>2@^3HsQ5vs;X;yy4ygVPRHrOjI@q{RK@d}jjr9b6dNJMG7QUhT?f$HY$n|7Rvg1sUDwkz4aRM?3=MNNK_=w2T-kJN+k>e&ZKu_0Ii_l8UEMR=P1kTW zSJy1v)l6HHP0ehna$7c9nqwHql^X~(%oc(H+sH$1qt!Akz2#}3Nu(A_YcBW-UF9S; zyrQc`iFuk1|3%Y(dRweWRbG(3G|uH$aSfax2~us_HSpI`u}!tZGjIZ1W#|phfW5*w zc?RMbPSY{8Jl&Dq2Ka!6dDJlH8IG?CmFhOkYsEIz4w&uj-2wks%048zto&{`F01g!+EM<0~cwukg~ zOM}HdYLxSWD2h8#zFL&e$#$chuZf!|uhInN1m#DM@-wc!u_?&bJDBird2L=jR0 z690*3I6co`nWk#G_exhco51MppuT)C+5?kWLwr+jYVQ7s7w{c6rowC@e(`gH7zK4; zm|$ISvqG5?9-Wv0AZBwG(3uQzIB77$bEXhGd+~A=JwkaO;Ji;qYtn zIsr}tNYgO)@i&lqQJkkqT=3UIu}yXpS85`SQCh(JsV08Ap~t=<^iHFM5Ni^*9S=^q zgn&r(LIuH}Zw0Ym$$#WIkW}4OXRz1RVw-AL4g{u8^XoYfTcyt^2Rc~roJKZu-0)&( zP841&<7ZU6;8hcOiIf56;*AvXviT@7Hb_sMCd1;uP-fRB4{$ z#q+wiQ{xZ1JblZ1JKJf)QRl%`hGr$U!f)rnaKSbiB*Hvh+pIHLW>IG{T($gmB3KX* z7O;B`sqkSrW%40>>;6T%F$><^jJ1zTY0c1V(QJBSFI7pMT+NKyxyTBZ=7v zg{(w#OCa$jk|d`554-1yr8D~MFj88l6(-L*GX{_edt}B_vR&WQ7g<(VX2!EW)x%cF zcYn^r9h`CkeIk#3tBi;r>b{sePpNj~&LYFpBzHa-fD;6Ldjx^;RCgh$R%}!4LJ-f@ z1cD%fAd;Q_Db(?Qke#j$FIlSNWi4%eM*NEt@rnEg!qoFz~ zzE!Vzx^3D`#pyag1{atuIIB2{skrTqXIe-%bzL``y50rtb?=R46E19-4Nq@%6x6Yr zZ{$iY4DUu(Z&c^TXx}*I1#`{Z(3g^Mga1i4tSGXVtSDU4q{#(oazUD05chY=1r2F( zaeDO(aSWu%1!;0YV1D{ACrvI$lM8~@V?pZ)--{$oE(ppA%1KR5np_aiKs>{le8r74 zxgbq0NRtcF+Vx=+eTlM4cQ0{OH5019bxaa!;tFnS{xktP>^2bRw0 zv%{px#VKXRq{+p}h1O&EAkcAmh7)=8OPXAq7QhLDzCD6SlM8|%lAV5Y)R86^rv^Qe zb`r>wCKm+q1o8y(Gc1!P7X2l+!RjY*S>qk%VRa&g%3CQUAOHo5peph!N0Puu_i DW$m=W literal 0 HcmV?d00001 diff --git a/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv b/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv new file mode 100644 index 0000000..2bf263c --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.duplicate_variant.tsv @@ -0,0 +1,2 @@ +CHROM POS REF ALT SYMBOL +chr17 7675088 C T TP53 diff --git a/tests/test_vep_annotation_reporter.py b/tests/test_vep_annotation_reporter.py index b32a1bf..7834513 100644 --- a/tests/test_vep_annotation_reporter.py +++ b/tests/test_vep_annotation_reporter.py @@ -120,3 +120,17 @@ def test_vcf_with_multiple_transcripts_and_no_pick(self): vep_annotation_reporter.main(command) self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.merge_multiple_transcripts.tsv'), os.path.join(temp_path.name, 'input.tsv'))) temp_path.cleanup() + + def test_vcf_with_duplicate_variant(self): + logging.disable(logging.NOTSET) + with LogCapture() as l: + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.duplicate_variant.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'SYMBOL', + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.duplicate_variant.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + l.check_present(('root', 'WARNING', "VEP entry at CHR chr17, POS 7675088, REF C , ALT T already exists. Skipping subsequent entries.")) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index b70e733..5ad5c8e 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -8,6 +8,7 @@ import tempfile import csv import binascii +import logging def define_parser(): parser = argparse.ArgumentParser( @@ -146,7 +147,7 @@ def extract_vep_fields(args): else: vep[chr][pos][ref][alt] = None else: - sys.exit("VEP entry for at CHR %s, POS %s, REF %s , ALT % already exists" % (chr, pos, ref, alt) ) + logging.warning("VEP entry at CHR %s, POS %s, REF %s , ALT %s already exists. Skipping subsequent entries." % (chr, pos, ref, alt) ) vcf_reader.close() return vep @@ -192,6 +193,7 @@ def main(args_input = sys.argv[1:]): with open(output_file, 'w') as output_filehandle: writer = csv.DictWriter(output_filehandle, fieldnames = ['CHROM', 'POS', 'REF', 'ALT'] + args.vep_fields, delimiter = "\t") writer.writeheader() + rows = [] for variant in vcf_reader: row = { 'CHROM': str(variant.CHROM), @@ -200,7 +202,9 @@ def main(args_input = sys.argv[1:]): 'ALT' : ','.join(map(lambda a: a.serialize(), variant.ALT)), } row = add_vep_fields_to_row(args, row, vep) - writer.writerow(row) + if row not in rows: + rows.append(row) + writer.writerows(rows) if __name__ == '__main__': main()