-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_depth.py
109 lines (86 loc) · 3.12 KB
/
vcf_depth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
##########################
# Author: B. Anderson
# Date: Nov 2021
# Modified: Mar 2022 (report stats; corrected for ipyrad behaviour)
# Description: capture and plot read depth information from a VCF file (VCF 4.0)
##########################
import sys
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statistics
# instantiate the parser
parser = argparse.ArgumentParser(description = 'A script to capture and plot read depth by site from a VCF file')
# add arguments to parse
parser.add_argument('vcf_file', type = str, help = 'The VCF file to convert')
parser.add_argument('-o', type = str, dest = 'output', help = 'Name of the output pdf without extension [default \"output\"]')
parser.add_argument('-m', type = str, dest = 'maxd', help = 'Maximum potential read depth to graph [default 500]')
# parse the command line
if len(sys.argv[1:]) == 0: # if there are no arguments
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
vcf_file = args.vcf_file
out_pre = args.output
maxd = args.maxd
if not vcf_file:
parser.print_help(sys.stderr)
sys.exit(1)
if not out_pre:
out_pre = 'output'
if not maxd:
maxd = 500
else:
maxd = int(maxd)
# process the VCF to grab sample labels and depths for each SNP locus
with open(vcf_file, 'r') as vcf:
snps = []
count_snps = 0
for line in vcf:
if line.startswith('#'): # a header INFO line
if line.startswith('#CHROM'): # the line with sample names
sample_labels = line.rstrip().split()[9:]
else:
calls = line.rstrip().split()[9:]
depths = []
for call in calls:
''' The format is GT:DP:CATG, so 0/0:85:0,85,0,0 for a homozygous AA with 85 depth
We want to grab the depth (DP)
'''
gt = call.split(':')[0].split('/')
if gt != ['.', '.']: # if not an N (shouldn't be needed, but currently is)
depths.append(int(call.split(':')[1]))
snps.append(depths)
count_snps = count_snps + 1
print('Read in a VCF file with ' + str(len(sample_labels)) + ' samples and ' + str(count_snps) + ' SNP loci')
# calculate the mean and max depth per site
# the mean is based on only present data (no missing = 0 depth)
means = []
maxs = []
for site in snps:
means.append(round(statistics.mean([item for item in site if item > 0])))
maxs.append(round(max(site)))
# a function to set a max for graphing
def use_max(my_list):
return(min(maxd, max(my_list)))
# plot histograms
fig, (ax1, ax2) = plt.subplots(2, 1, sharex = True,
figsize = (10, 10))
plt.subplots_adjust(hspace = 0.2)
ax1.hist(means, density = False, range = [0, use_max(means)],
bins = round(use_max(means) / 5))
ax1.set_title('Mean depth per site')
ax1.set_ylabel('Sites')
ax2.hist(maxs, density = False, range = [0, use_max(maxs)],
bins = round(use_max(maxs) / 5))
ax2.set_title('Maximum depth per site')
ax2.set_ylabel('Sites')
ax2.set_xlabel('Depth')
ax2.set_xlim(0, use_max(maxs) + 0.01 * use_max(maxs))
plt.savefig(out_pre + '_depth.pdf')
# report statistics
print('Overall mean depth: ' + str(statistics.mean(means)))
print('Minimum mean depth: ' + str(min(means)))
print('Maximum mean depth: ' + str(max(means)))