forked from AntonelliLab/seqcap_processor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
assemble_reads.py
258 lines (232 loc) · 8.89 KB
/
assemble_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/local/opt/python/bin/python
#author: Tobias Hofmann, [email protected]
import os
import sys
import re
import glob
import shutil
import argparse
import commands
import subprocess
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#%%% Input %%%
# Complete path function
class CompletePath(argparse.Action):
"""give the full path of an input file/folder"""
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values)))
# Get arguments
def get_args():
parser = argparse.ArgumentParser(
description="Assemble trimmed Illumina read files (fastq)",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
'--input',
required=True,
action=CompletePath,
default=None,
help='Call the folder that contains the trimmed reads, organized in a separate subfolder for each sample. The name of the subfolder has to start with the sample name, delimited with an underscore [_]'
)
parser.add_argument(
'--output',
required=True,
action=CompletePath,
default=None,
help='The output directory where results will be safed'
)
parser.add_argument(
'--assembler',
choices=["trinity", "abyss"],
default="abyss",
help="""The assembler to use."""
)
parser.add_argument(
'--trinity',
default="/usr/local/bin/trinityrnaseq_r20140717/Trinity",
action=CompletePath,
help='The path to the Trinity executable'
)
parser.add_argument(
'--abyss',
default="/usr/local/packages/anaconda2/bin/abyss-pe",
action=CompletePath,
help='The path to the abyss-pe executable'
)
parser.add_argument(
'--kmer',
type=int,
default=35,
help='Set the kmer value'
)
parser.add_argument(
'--contig_length',
type=int,
default=200,
help='Set the minimum contig length for Trinity assembly. Contigs that are shorter than this threshold will be discarded.'
)
parser.add_argument(
'--single_reads',
action='store_true',
default=False,
help='Use this flag if you additionally want to use single reads for the assembly'
)
parser.add_argument(
'--cores',
type=int,
default=1,
help='For parallel processing you can set the number of cores you want to run Trinity on.'
)
return parser.parse_args()
# Preparation for calling input variables and files
args = get_args()
# Set working directory
out_folder = args.output
out_dir = "%s/stats" %out_folder
if not os.path.exists(out_dir):
os.makedirs(out_dir)
# Get all the other input variables
input_folder = args.input
min_length = args.contig_length
trinity = args.trinity
cores = args.cores
abyss = args.abyss
kmer = args.kmer
home_dir = os.getcwd()
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#%%% Functions %%%
def assembly_trinity(forw,backw,output_folder,id_sample):
print "De-novo assembly with Trinity of sample %s:" %id_sample
command = [
trinity,
"--seqType",
"fq",
"--left",
forw,
"--right",
backw,
"--CPU",
str(cores),
"--min_contig_length",
str(min_length),
"--JM",
"20G",
"--output",
output_folder
]
try:
print "Building contigs........"
with open(os.path.join(output_folder, "%s_trinity_screen_out.txt" %id_sample), 'w') as log_err_file:
p = subprocess.Popen(command, stdout=log_err_file)
p.communicate()
print "%s assembled. Trinity-stats are printed into %s" %(id_sample, os.path.join(output_folder, "%s_trinity_screen_out.txt" %sample_id))
except:
print "Could not assemble %s" %id_sample
def assembly_abyss(forw,backw,singlef,singleb,output_folder,id_sample):
print "De-novo assembly with abyss of sample %s:" %id_sample
command = [
abyss,
"k={}".format(kmer),
"j={}".format(cores),
'name={}'.format(id_sample),
'in={} {}'.format(forw,backw)
]
if args.single_reads:
command.append('se={} {}'.format(singlef,singleb))
try:
print "Building contigs........"
with open(os.path.join(output_folder, "%s_abyss_screen_out.txt" %id_sample), 'w') as log_err_file:
p = subprocess.Popen(command, stdout=log_err_file)
p.communicate()
print "%s assembled. Statistics are printed into %s" %(id_sample, os.path.join(output_folder, "%s_abyss_screen_out.txt" %sample_id))
except:
print "Could not assemble %s" %id_sample
def get_stats(sample_output_folder,sample_id):
print "Extracting statistics for", sample_id
# Read counts
read_count_cmd = subprocess.Popen(["cat", "%s/both.fa.read_count" %sample_output_folder], stdout=subprocess.PIPE)
read_count = read_count_cmd.communicate()[0]
# Assembled read counts
assembled_reads_cmd = subprocess.Popen(["wc", "-l", "%s/chrysalis/readsToComponents.out.sort" %sample_output_folder], stdout=subprocess.PIPE)
assembled_reads = assembled_reads_cmd.communicate()[0]
assembled_reads_count, file = assembled_reads.split(" ")
# Contig count
unimportant = ""
contig_count_cmd = subprocess.Popen(["tail", "-n", "1", "%s/chrysalis/readsToComponents.out.sort" %sample_output_folder], stdout=subprocess.PIPE)
contig_count_pre = contig_count_cmd.communicate()[0]
print contig_count_pre
contig_count, header, percent, sequence = contig_count_pre.split("\t")
with open(os.path.join(sample_output_folder, "%s_stats.txt" %sample_id), 'w') as stat_file:
stat_file.write("Statistics for sample %s\n" %sample_id)
stat_file.write("Read-count in trimmed fastq read-files : %s" %read_count)
stat_file.write("Reads assembled into contigs : %s\n" %assembled_reads_count)
stat_file.write("Assembled contigs : %s\n" %contig_count)
def cleanup_trinity_assembly_folder(sample_output_folder, sample_id):
# This function is copied (and slightly modified) from phyluce, written by Brant Faircloth
print "Removing unnecessary files from the Trinity folder for %s" %sample_id
files = glob.glob(os.path.join(sample_output_folder, '*'))
# check the names to make sure we're not deleting something improperly
names = [os.path.basename(f) for f in files]
try:
assert "Trinity.fasta" in names
assert "%s_trinity_screen_out.txt" %sample_id in names
except:
raise IOError("Neither Trinity.fasta nor %s_trinity_screen_out.txt were found in output." %sample_id)
for file in files:
if not os.path.basename(file) in ("Trinity.fasta", "%s_trinity_screen_out.txt" %sample_id, "%s_stats.txt" %sample_id):
if os.path.isfile(file) or os.path.islink(file):
os.remove(file)
elif os.path.isdir(file):
shutil.rmtree(file)
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#%%% Workflow %%%
print "\n\nRunning %s parallel on %d cores" %(args.assembler,cores)
for subfolder, dirs, files in os.walk(input_folder):
subfolder_path_elements = re.split("%s/" %input_folder, subfolder)
if subfolder_path_elements[-1] != input_folder:
sample_folder = subfolder_path_elements[-1]
sample_id = re.split("_", sample_folder)[0]
# Loop through each sample-folder and find read-files
sample_output_folder = "%s/%s" %(out_dir,sample_id)
if not os.path.exists(sample_output_folder):
os.makedirs(sample_output_folder)
for misc1, misc2, fastq in os.walk(subfolder):
forward = ""
backward = ""
single_f = ""
single_b = ""
for element in fastq:
if sample_id in element and element.endswith("READ1.fastq"):
forward = "%s/%s" %(subfolder,element)
if sample_id in element and element.endswith("READ2.fastq"):
backward = "%s/%s" %(subfolder,element)
if sample_id in element and element.endswith("READ1-single.fastq"):
single_f = "%s/%s" %(subfolder,element)
if sample_id in element and element.endswith("READ2-single.fastq"):
single_b = "%s/%s" %(subfolder,element)
if forward != "" and backward != "":
print "\n", "#" * 50
print "Processing sample", sample_id, "\n"
if args.assembler == "trinity":
assembly_trinity(forward,backward,sample_output_folder,sample_id)
get_stats(sample_output_folder,sample_id)
cleanup_trinity_assembly_folder(sample_output_folder,sample_id)
print "\n", "#" * 50
mv_cmd = "mv %s/Trinity.fasta %s/%s.fasta" %(sample_output_folder,out_folder,sample_id)
os.system(mv_cmd)
elif args.assembler == "abyss":
assembly_abyss(forward,backward,single_f,single_b,sample_output_folder,sample_id)
files = glob.glob(os.path.join(home_dir,'*'))
links = [f for f in files if os.path.islink(f)]
for l in links:
if l.endswith("-contigs.fa"):
contig_file = os.path.realpath(l)
mv_contig = "mv %s %s/../../%s.fa" %(contig_file,sample_output_folder,sample_id)
os.system(mv_contig)
mv_cmd1 = "mv %s/%s* %s" %(home_dir,sample_id,sample_output_folder)
os.system(mv_cmd1)
mv_cmd2 = "mv %s/coverage.hist %s" %(home_dir,sample_output_folder)
os.system(mv_cmd2)
else:
print "\nError: Read-files for sample %s could not be found. Please check if subfolders/sample-folders are named in this pattern: 'sampleID_clean' and if the cleaned fastq files in the sample-folder end with 'READ1.fastq' and 'READ2.fastq' respectively." %sample_id
raise SystemExit