-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrun_deepseq.pl
executable file
·451 lines (352 loc) · 12.2 KB
/
run_deepseq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
#! /usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin";
use modules::Deepseq;
use modules::Exception;
use modules::SystemCall;
use Data::Dumper;
use File::Basename;
use Cwd 'abs_path';
use Pod::Usage;
use Getopt::Long;
use vars qw(%OPT);
# Options from command line:
GetOptions(
\%OPT,
"help|h",
"man|m",
"filename_stub=s",
"read1_fastq=s",
"read2_fastq=s",
"start_command=s",
"end_command=s",
"working_dir=s",
"no_adaptor",
"no_uid",
"uid_len1=i",
"uid_len2=i",
"coord_bed=s",
"ref_fasta=s",
"bwa=s",
"samtools=s",
"sm_count=i",
"sm_portion=f",
"threads=i",
"graph",
"min_seqlen=i",
"min_group=i",
"conf_file=s",
"no_pair_match",
"cut_length=i",
"no_revcom",
"combine_reads",
"uid_done"
) || modules::Exception->throw("ERROR: Problem with command line arguments");;
pod2usage(-verbose => 2) if $OPT{man};
pod2usage(1) if ($OPT{help} || !$OPT{filename_stub} || !$OPT{read1_fastq} || !$OPT{read2_fastq} || !$OPT{coord_bed}) ;
=pod
=head1 SYNOPSIS
run_deepseq.pl -filename_stub unique_samplename -read1_fastq fastq1 -read2_fastq fastq2 -start_command command_to_start_from(default=first_command) -end_command last_command_to_run(default=last_command) -working_dir working_dir(default=pwd/filename_stub_RAND) -no_adaptor no_adaptor_sequence(default=present) -uid_len1 length_of_5prime_uid_length(default=10) -uid_len2 length_of_3prime_uid_length(default=0) -bwa full_bwa_path(default=/usr/bin/bwa) -samtools full_samtools_path(default=/usr/bin/samtools) -ref_fasta full_path_to_reference_fasta(must contain bwa index files as well) -coord_bed bed_file_containing_target_coordinates -sm_count min_number_of_reads_for_supermutant(default=10) -sm_portion min_fraction_of_variant_bases_within_UID_reads(default=0.90) -threads num_threads_for_bwa(default=1) -config create_config_file -graph generate_variant_graphs_for_each_genomic_region(requires R) -min_seqlen minimum_sequence_length_to_keep_seq(default=0) -min_group min_num_of_passing_groups_to_qualify_for_supermutant(default=2) -conf_file deepseq_conf_fiel(default=deepseq.conf) -no_pair_match don't_require_barcodes_in_read_pairs_to_match -cut_length remove_this_many_bases(default=uid1+uid2) -no_revcom no_revcom_for_read2 -combine_reads combine_barcodes_from_R1_and_R2 -uid_done uid_already_in_fastq(format_is_">read1:UID=SEQ")
Required flags: -filename_stub -read1_fastq -read2_fastq -coord_bed
=head1 OPTIONS
-help brief help message
-man full documentation
=head1 NAME
run_deepseq.pl -> Wrapper script for deepseq run
=head1 DESCRIPTION
date
a script that ...
=head1 AUTHOR
Matthew Field
=head1 EXAMPLE
sample -f file
=cut
my $bwa;
my $bwa_index;
my $samtools;
my $ref_fasta;
my $PRINT_TO_LOGFILE = 1;
my $PRINT_TO_STDOUT = 1;
my (undef,$base) = fileparse(abs_path($0));
my $conf_file = defined $OPT{conf_file}?$OPT{conf_file}:$base.'/deepseq.conf';
my $sys_call = modules::SystemCall->new();
#Set the variables either using a conf file or parse them from the command line
if (-e $conf_file) {
#Read the config file to get paths needed
open(FILE,"$conf_file") || modules::Exception->throw("Can't open file $conf_file\n");
while (<FILE>) {
chomp;
if (/samtools=(.+)$/) {
$samtools = $1;
if ( !-e $samtools ) {
modules::Exception->throw("Samtools $samtools doesn't exist");
}
} elsif (/bwa=(.+)$/) {
$bwa = $1;
if ( !-e $bwa ) {
modules::Exception->throw("BWA $bwa doesn't exist");
}
} elsif (/ref_fasta=(.+)$/) {
$ref_fasta = $1;
if ( !-e $ref_fasta ) {
modules::Exception->throw("Ref fasta $ref_fasta doesn't exist");
}
} elsif (/bwa_index=(.+)$/) {
$bwa_index = $1;
if ( !-e $bwa_index ) {
if (!-e $bwa_index.'.sa') {
modules::Exception->throw("BWA index file $bwa_index doesn't exist");
}
}
}
}
} else {
#otherwise all the variables need to be defined on the command line or available in /usr/bin
if (!$OPT{ref_fasta}) {
modules::Exception->throw("ERROR: Without config file must use -ref_fasta");
} else {
$ref_fasta = $OPT{ref_fasta};
}
my $ref_fasta_abs = abs_path($ref_fasta);
#Check the bwa index is present....
my $bwa_index_file = $ref_fasta_abs . '.sa';
if (!-e $bwa_index_file) {
#Sometimes it ref.fa.sa but sometimes it's ref.sa so check both
print "Can't find the index file $bwa_index_file\n";
($bwa_index_file = $ref_fasta_abs) =~ s/\.fa$/\.sa/;
print "Checking for index file $bwa_index_file\n";
if (!-e $bwa_index_file) {
modules::Exception->throw("ERROR: Need to generate the index files for bwa in the referece directory; either 1) run bwa beforehand with 'bwa index -a bwtsw ref.fa' OR 2) run configure_deepseq.pl first");
}
}
#bwa only requires the file prefix name
($bwa_index = $bwa_index_file) =~ s/\.sa$//;
if ($OPT{bwa}) {
$bwa = $OPT{bwa};
} elsif (-e '/usr/bin/bwa') {
$bwa = '/usr/bin/bwa';
} else {
modules::Exception->throw("ERROR: Without config file must use -bwa or have bwa at /usr/bin/bwa");
}
if ($OPT{samtools}) {
$samtools = $OPT{samtools};
} elsif (-e '/usr/bin/samtools') {
$samtools = '/usr/bin/samtools';
} else {
modules::Exception->throw("ERROR: Without config file must use -samtools or have samtools at /usr/bin/samtools");
}
}
#Check the fasta is available
if (!-e $ref_fasta) {
modules::Exception->throw("ERROR: Problem with $ref_fasta");
}
my $ref_fasta_abs = abs_path($ref_fasta);
my $pwd = `pwd`;
chomp $pwd;
#now get the sequence specific variables
my $filename_stub = $OPT{filename_stub};
my $read1_fastq = $OPT{read1_fastq};
my $read2_fastq = $OPT{read2_fastq};
if ($read1_fastq =~ /bz2$/ || $read1_fastq =~ /gz$/ || $read1_fastq =~ /zip$/) {
modules::Exception->throw("ERROR: Can't work with compressed files; please uncompress first");
}
my $working_dir;
if (defined $OPT{working_dir}) {
$working_dir = $OPT{working_dir};
} else {
$working_dir = $pwd . '/' . $filename_stub . '_' . $$;
if (! -e $working_dir){
`mkdir $working_dir`;
}
}
if ($OPT{graph}) {
if (!-e "$working_dir/graphs") {
`mkdir $working_dir/graphs`;
}
}
my $adaptor = defined $OPT{no_adaptor}?0:1; #default is that there is adaptor sequence info
my $uid = defined $OPT{no_uid}?0:1; #default is that there is uid info
my $uid_len1 = defined $OPT{uid_len1}?$OPT{uid_len1}:10;
my $uid_len2 = defined $OPT{uid_len2}?$OPT{uid_len2}:0;
my $coord_bed = $OPT{coord_bed};
if ( !-e $coord_bed ) {
modules::Exception->throw("File $coord_bed doesn't exist");
}
#Change all directories files to absolute path
my $working_dir_abs = abs_path($working_dir);
my $bwa_abs = abs_path($bwa);
my $samtools_abs = abs_path($samtools);
my $coord_bed_abs = abs_path($coord_bed);
my $read1_fastq_abs = abs_path($read1_fastq);
my $read2_fastq_abs = abs_path($read2_fastq);
#Check they all exist
if ( !-e $ref_fasta_abs ) {
modules::Exception->throw("File $ref_fasta_abs doesn't exist");
}
if ( !-e $read1_fastq_abs || !-e $read2_fastq_abs ) {
modules::Exception->throw("File $read1_fastq_abs || !-e $read2_fastq_abs doesn't exist");
}
if ( !-e $working_dir_abs ) {
modules::Exception->throw("File $working_dir_abs doesn't exist");
}
if ( !-e $bwa_abs ) {
modules::Exception->throw("File $bwa_abs doesn't exist");
}
if ( !-e $samtools_abs ) {
modules::Exception->throw("File $samtools_abs doesn't exist");
}
if ( !-e $coord_bed_abs ) {
modules::Exception->throw("File $coord_bed_abs doesn't exist");
}
my $read1_fastq_short = my $read2_fastq_short;
unless (chdir $working_dir_abs) {
modules::Exception->throw("Didn't manage to change to working dir [$working_dir_abs]. Still in this directory [$pwd]");
#unless $pwd eq $working_dir;
} else {
#Create symlinks for read files (don't copy; too time consuming)
`ln -s $read1_fastq_abs .`;
`ln -s $read2_fastq_abs .`;
#pass these as we've created symlinks now
($read1_fastq_short) = basename($read1_fastq_abs);
($read2_fastq_short) = basename($read2_fastq_abs);
}
#Set the required arguments
my %args = (
-filename_stub => $filename_stub,
-read1_fastq => $read1_fastq_short,
-read2_fastq => $read2_fastq_short,
-working_dir => $working_dir_abs,
-uid_len1 => $uid_len1,
-uid_len2 => $uid_len2,
-coord_bed => $coord_bed_abs,
-ref_fasta => $ref_fasta_abs,
-bwa => $bwa_abs,
-bwa_index => $bwa_index,
-samtools => $samtools_abs,
-base=>$base
);
#Optional arguments
if (!$uid) {
$args{-no_uid} = 1;
}
if ($OPT{no_pair_match}) {
$args{-no_pair_match} = 1;
}
if ($OPT{no_revcom}) {
$args{-no_revcom} = 1;
}
if ($OPT{combine_reads}) {
$args{-combine_reads} = 1;
}
if ($OPT{cut_length}) {
$args{-cut_length} = $OPT{cut_length};
}
if (defined $OPT{sm_count}) {
$args{-sm_count} = $OPT{sm_count};
}
if (defined $OPT{sm_portion}) {
$args{-sm_portion} = $OPT{sm_portion};
}
if (defined $OPT{threads}) {
$args{-num_threads} = $OPT{threads};
}
if (defined $OPT{min_seqlen}) {
$args{-min_seqlen} = $OPT{min_seqlen};
}
if (defined $OPT{min_group}) {
$args{-min_group} = $OPT{min_group};
}
if (defined $OPT{uid_done}) {
$args{-uid_done} = 1;
}
my $start_command = defined $OPT{'start_command'}?$OPT{start_command}:'check_fastq';
my $start_number = 1;
my $end_command = defined $OPT{'end_command'}?$OPT{end_command}:'graph';
# Start a log/results file
my $logfile = $filename_stub . '.log';
my $OUT;
if (! -e $logfile){
#$logfile = $working_dir_abs . '/' . $filename_stub . '.log';
open($OUT, ">$logfile")
or modules::Exception->throw("Unable to start logfile [$logfile] : $!\n");
} else {
open($OUT, ">>$logfile")
or modules::Exception->throw("Unable to start logfile [$logfile] : $!\n");
}
### select $OUT; # write as much blurb to logfile instead of STDOUT (turn this back to STDOUT with 'select STDOUT;')
print $OUT "\n### Log edited : " . &low_rent_timestamp() . " ###\n\n" if $PRINT_TO_LOGFILE;
print STDOUT "Logfile (check here also for error messages) : $logfile\n" if $PRINT_TO_STDOUT;
print $OUT "Working directory : $working_dir\n" if $PRINT_TO_LOGFILE;
# Create deepseq object
my $deepseq = modules::Deepseq->new(
%args
);
if ($start_command ne 'check_fastq') {
if (!$deepseq->check_step($start_command)) {
modules::Exception->throw("ERROR: Can't start at step $start_command; options are check_fastq,pool_reads,bwa,call_variants,report,or graph");
}
$start_number = $deepseq->get_step_number($start_command);
}
if ($end_command ne 'graph') {
if (!$deepseq->check_step($end_command)) {
modules::Exception->throw("ERROR: Can't end at step $end_command; options are check_fastq,pool_reads,bwa,call_variants,report,or graph");
}
}
#If there is adaptor sequence(s) get it from the fastq and mark it in each sequence to avoid creating supergroup of adaptors
if ($adaptor) {
if (my $adaptor_seqs = $deepseq->get_adaptor()) {
my $adaptor_str = join(",",@{$adaptor_seqs});
print "Will remove the following adaptors:\n$adaptor_str\n";
$deepseq->set_adaptor($adaptor_seqs);
}
}
# Retrieve commands from config file
my $commands = $deepseq->get_commands();
# Try to run each command in order
for my $command_count (sort keys %{$commands}) {
my @commands = @{$commands->{$command_count}{commands}};
my @outs = @{$commands->{$command_count}{stdout}};
if (@commands != @outs) {
modules::Exception->throw("ERROR: Different number of commands and stdout flags");
}
my $block = $commands->{$command_count}{block};
if ($start_number > 1 && $command_count < $start_number) {
next;
}
if ($block eq 'graph') {
next unless $OPT{graph};
}
print "\n\nRunning block $block\n\n";
for (my $i = 0; $i < @commands; $i++) {
print $OUT "\tRun " . $commands[$i] . "\n" if $PRINT_TO_LOGFILE;
if ($outs[$i] == 1) {
my $output = `$commands[$i]`;
print $OUT "$output" if $PRINT_TO_LOGFILE;
} else {
$sys_call->run($commands[$i]);
}
}
if ($end_command eq $block) {
exit;
}
}
# Close logfile
close($OUT);
# Return to previous directory
chdir $pwd;
sub low_rent_timestamp() {
my ($sec,
$min,
$hour,
$mday,
$month,
$year,
$wday,
$yday,
$isdst) = localtime(time);
$year += 1900;
$month += 1;
my $timestamp_str = $year . '-' . $month . '-' . $mday . ' ' . $hour . ':' . $min . ':' . $sec;
return $timestamp_str;
}