-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtract_white0.85.py
149 lines (120 loc) · 3.36 KB
/
Extract_white0.85.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Extract whites only from the already parsed out VQHIGH variants
Extract variants that have missing/unknow genotypes in more then 10 percent wellderly or inova
Extract variants that cave median covereage <10 percent or >100
There are variants that don't have any covereage at all, keep those for now
TO DO: deal with the clustered variants, not many really
"""
import os, sys, gzip, datetime
def main(chrom):
infile = "/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_wellderly_inova_withCorrectGenotypes/"+chrom+".vqhigh.vcf.gz"
outfile = "/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_wellderly_inova_withCorrectGenotypes/"+chrom+".filtered0.85.vcf.gz"
whites = "/gpfs/group/stsi/data/projects/wellderly/GenomeComb/vcf_wellderly_inova_withCorrectGenotypes/white_0.85.txt"
i = gzip.open(infile)
o = gzip.open(outfile, 'w')
w = open(whites)
whites_id = []
header = []
#index the whites
ln = w.readline()
ln = ln.strip()
whites_id = ln.split()
#whites_id.append(ln)
print "Total whites " + str(len(whites_id))
print whites_id
counter = 0
good_counter = 0
buffe = ""
for line in i:
if line[:2] == "##":
print "header"
o.write(line)
elif line[0] == "#":
o.write(line)
line = line.strip()
header = line.split("\t")
#print header
else:
counter += 1
if counter%10000 ==0:
print "Total lines"
print str(counter)
print "Good lines"
print str(good_counter)
sys.stdout.flush()
if good_counter%100 == 0:
o.write(buffe)
buffe=""
o.flush()
line = line.strip()
line = line.split("\t")
#line_for_cluster = tp_line.split(":")
#print "cluster data"
#print line_for_cluster[-2]
missing_well=0
missing_inova=0
median_covereage=0
total_cov_count = 0
white_line = []
for index, gen in enumerate(line[9:]):
index = index +9
#Extract whites only
indiv_name = header[index]
#print indiv_name
if indiv_name in whites_id:
white_line.append(gen)
tp_gen = gen.split(":")
if "." in tp_gen[0]:
if indiv_name[-3] == 'DID':
missing_well += 1
else:
missing_inova += 1
try:
if '?' not in tp_gen[5]:
try:
median_covereage += int(tp_gen[5])
total_cov_count += 1
except:
print "Bad covereage " + tp_gen[5]
except:
continue
#If we have more then 10% missing:
if missing_well > 57 or missing_inova >74:
#print "too many missing"
continue
#Verify covereage
elif total_cov_count > 0:
if (median_covereage/total_cov_count) <10 or (median_covereage/total_cov_count) > 100:
#print "Weird covereage " + str(median_covereage/total_cov_count)
continue
else:
#print line[:9]
good_counter += 1
buffe = buffe + "\t".join(line[:9]) + "\t" + "\t".join(white_line) + "\n"
#print header[index]
#print header
#break
'''
if good_counter%100 == 0:
o.write(buffe)
buffe = ""
'''
o.write(buffe)
w.close()
i.close()
o.close()
print "Total lines"
print str(counter)
print "Good lines"
print str(good_counter)
print "End"
if __name__ == '__main__':
print "Python Version: " + sys.version
#args = parse_args()
import time
#start = time.time()
main(sys.argv[1])
#main(sys.argv[1], sys.argv[2], sys.argv[3])
#end = time.time()
#print 'This took {0} seconds'.format(end - start)
#main()