-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvarscan2mutator.py
74 lines (67 loc) · 2.21 KB
/
varscan2mutator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import sys
from datetime import datetime
def decode_variant(ref, code):
if code in set('ACGT'):
return code
elif code in set('BDHVN.'):
return None
else:
if code == 'R':
candidates = ['A', 'G']
elif code == 'Y':
candidates = ['C', 'T']
elif code == 'S':
candidates = ['G', 'C']
elif code == 'W':
candidates = ['A', 'T']
elif code == 'K':
candidates = ['G', 'T']
elif code == 'M':
candidates = ['A', 'C']
else:
print(code)
assert False
if ref in candidates:
candidates.remove(ref)
assert len(candidates) == 1
return candidates[0]
else:
return None
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
print(datetime.now(), "Starting")
n_written = 0
n_multimutlines = 0
n_skipped = 0
with open(outfile, 'w') as fout:
fout.write("#CHR\tPOS\tREF\tVAR\n")
with open(infile, 'r') as fin:
fin.readline()
prev_line = ("chr1", "-1")
for line in fin:
if line.startswith('##'):
continue
chr, pos, ref, code = line.split('\t')[:4]
if len(chr) > 5:
continue
if (chr, pos) == prev_line:
n_multimutlines += 1
continue
else:
prev_line = (chr, pos)
var_allele = decode_variant(ref, code)
if var_allele is None:
n_skipped += 1
continue
else:
n_written += 1
fout.write("\t".join([chr, pos, ref, var_allele]) + '\n')
print(datetime.now(), "Done. Wrote {} variants, skipped {} variants that were not the first at the site, skipped {} variants with multiple/ambiguous SNVs.".format(n_written, n_multimutlines, n_skipped))
if __name__ == '__main__':
n_args = 2
if len(sys.argv) != n_args + 1:
print("Incorrect number of arguments (expecting %d, saw %d)" % (n_args, len(sys.argv) - 1))
quit(1)
main()