-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_kraken_output.py
executable file
·158 lines (109 loc) · 8.5 KB
/
parse_kraken_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /Users/jenniferlspillane/opt/anaconda3/bin/python
#This script assumes that you have run kraken2 on fasta reads, though most of it will work for contigs as well.
#It requires the kraken output that details how the kmers have been assigned (-k) and assumes there isn't anything else in that file.
#You can easily get this output by redirecting the standard output from kraken2.
#By default, it will give you information about what percentage of kmers in each read kraken2 designated "classified" or "unclassified",
#and the status it assigned the read as a whole, in a format that is good for ggplot.
#You can also optionally use it to pull out header names of reads that are below a certain amount "classified" that you can use to pull these sequences from the original files.
#You'll also need to designate a threshold value to allow reads into the new fasta files,
#though if you don't, it will default to allowing in reads with up to 0.25 classified kmers.
import argparse
import Bio.SeqIO
def calculate_percents(kmer_sets):
#setting up the lists of both kinds of kmers with 0s so that it's never empty for the math later
unclassified_kmers = [0.0]
classified_kmers = [0.0]
percentage = 0.0
switch = True
for item in kmer_sets:
#split on the colons to separate the id from the number
num_kmers = item.split(":")
#if the id is 0 (unclassified), we add that number of kmers to the list, if it's an "A", we ignore it, every other kmer number gets added to the other list.
if num_kmers[0].strip() == "0":
unclassified_kmers.append(float(num_kmers[1]))
elif num_kmers[0].strip() == "A":
continue
else:
classified_kmers.append(float(num_kmers[1]))
total_kmers = ((sum(classified_kmers)) + (sum(unclassified_kmers)))
#print(total_kmers)
if total_kmers == 0.0:
switch = False
else:
#calculate the percentage of classified kmers in the total kmers from the read for the first read
percentage = (sum(classified_kmers)) / total_kmers
#print(percentage, switch)
return [percentage, switch]
def parse_kraken():
fasta_ids_for_output = []
try:
with open(args.kraken_output, "r") as kraken_file:
with open(args.output, "a") as output_file:
with open("{0}".format(args.new_list_file), "a") as list_file:
#an option that stores "true", so that the function can handle paired or unpaired reads
if args.paired == True:
for line in kraken_file:
#splitting the lines on tabs first
fields = line.split('\t')
#this stores the letter at the beginning of the line, either "C" or "U"
read_class = fields[0].strip()
#the read name is harvested from the header by kraken2 - should be useful for pulling reads later
read_name = fields[1].strip()
#splits into the kmer assignments for each read in the pair
assignment = fields[4].split("|:|")
#need to split again on whitespace to get the individual id and number pairs
kmer_sets1 = (assignment[0].strip()).split()
kmer_sets2 = (assignment[1].strip()).split()
#call the calculation function for each read in the pair and store the result
percent_classified1 = calculate_percents(kmer_sets1)
percent_classified2 = calculate_percents(kmer_sets2)
if percent_classified1[1] and percent_classified2[1]:
#make a tuple to keep everything together
output_info = (read_class, read_name, percent_classified1[0], percent_classified2[0])
#write the output file so that the format works for ggplot (each read in the pair on a different line)
output_file.write("{0}\t{1}\tread1\t{2}\n{0}\t{1}\tread2\t{3}\n".format(output_info[0], output_info[1], output_info[2], output_info[3]))
if args.new_list_file is not None:
#check to see if the percentages are both above the threshold argument
if percent_classified1[0] <= args.threshold and percent_classified2[0] <= args.threshold:
list_file.write("{0}\n".format(read_name))
else:
continue
#now do the same thing, but without the paired reads, the format of the file is different
else:
for line in kraken_file:
#splitting the lines on tabs first
fields = line.split('\t')
#this stores the letter at the beginning of the line, either "C" or "U"
read_class = fields[0].strip()
#the read name is harvested from the header by kraken2 - should be useful for pulling reads later
read_name = fields[1].strip()
#splits into the kmer assignments for each read in the pair
kmer_sets3 = fields[4].split()
#call the calculation function for the read and store the result
percent_classified = calculate_percents(kmer_sets3)
if percent_classified[1]:
#make a tuple to keep everything together and append to the list of all the lines we'll write to the output file
output_info = (read_class, read_name, percent_classified[0])
#lines_for_output.append(output_info)
#output file can be more basic as there aren't as many things to keep straight
output_file.write("{0}\t{1}\t{2}\n".format(output_info[0], output_info[1], output_info[2]))
if args.new_list_file is not None:
#check to see if the percentages are both above the threshold argument
if percent_classified[0] <= args.threshold:
list_file.write("{0}\n".format(read_name))
else:
continue
except IOError:
print("Issue reading or writing file")
parser = argparse.ArgumentParser(description = "Arguments for parsing the kraken2 stdout output")
parser.add_argument("-k", "--kraken_output", required = True, help = "path to the file containing the kraken output for individual reads/sets")
parser.add_argument("-p", "--paired", action = "store_true", help = "a flag to signal that the kraken output comes from paired-end reads")
parser.add_argument("-o", "--output", required = True, help = "path to a csv file containing the ratios of classified and unclassified kmers for each read/set")
parser.add_argument("-l", "--new_list_file", help = "an optional argument consisting of a file name/path that will produce a file (or files if paired) with a list of all sequences passing the threshold")
parser.add_argument("-t", "--threshold", type = float, default = 0.25, help = "the percentage of classified kmers that will be tolerated in the final list of sequences (the -n option)")
args = parser.parse_args()
parse_kraken()
#./parse_kraken_output.py -k top_test_conus_kraken.out -p -o revised_test_percentages1.csv
#./parse_kraken_output.py -k top_test_conus_kraken.out -p -o revised_test_percentages2.csv -l conus_list_file.txt -t 0.25
#./parse_kraken_output.py -k top_test_pleuro_kraken.out -o test_percentages2.csv
#./parse_kraken_output.py -k bac_kraken_output/SRR10987117.out -p -o bac_kraken_classified_percents/SRR10987117_percentages.csv