-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathphyloFlash_makedb.pl
executable file
·603 lines (485 loc) · 18 KB
/
phyloFlash_makedb.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
#!/usr/bin/env perl
use strict;
use warnings;
=pod
=head1 NAME
phyloFlash_makedb.pl - prepares the phyloFlash dbdir
=head1 SYNOPSIS
### download databases via FTP
phyloFlash_makedb.pl --remote
### use local copies
phyloFlash_makedb.pl --silva_file F<path/to/silva_db> --univec_file F<path/to/univec_db>
## Get help
phyloFlash_makedb.pl --help
phyloFlash_makedb.pl --man
=head1 DESCRIPTION
=over 3
=item -
Assumes the script is executed in a dir with write access and
=item -
Dependencies: wget, vsearch, bbmap and Emirge
=item -
This script generates approx. 5 Gbytes of extracted databases (9 Gb if Vsearch
UDB database is also produced, with vsearch v2.5.0+)
=back
=head1 ARGUMENTS
=head2 INPUT FILES
=over 8
=item --remote
Download databases via FTP
=item --silva_file F<path/to/silva_db>
Path to local copy of SILVA database file. Ignored if --remote flag is used.
This should be the Fasta-formatted SILVA SSURef file, clustered at 99% identity,
with SILVA taxonomy strings in file header, and sequences truncated to SSU gene
boundaries. The file name should be in the form
I<SILVA_[Release]_SSURef_Nr99_tax_silva_trunc.fasta.gz> (release 132 and before) or
I<SILVA_[Release]_SSURef_NR99_tax_silva_trunc.fasta.gz> (from release 138 onwards)
=item --univec_file F<path/to/univec_db>
Path to local copy of Univec database file. Ignored if --remote flag is used.
=back
=head2 OPTIONAL TOOLS
=over 8
=item --emirge
Index database with Bowtie v1 for Emirge. Requires I<bowtie-build> to be in path.
Default: Yes (turn off with --noemirge)
=item --sortmerna
Index database for Sortmerna, if you wish to use it as an alternative to BBmap
for extracting rRNA reads from the read file. Requires I<indexdb_rna> to be in
path.
Default: No (--nosortmerna).
=back
=head2 CONFIGURATION AND HELP
=over 8
=item --keep
Do not delete intermediary files
=item --overwrite
Overwrite files if already present. Files are not overwritten by default,
allowing you to restart the DB build process if it was interrupted (but you will
have to do find and delete corrupted files manually).
Default: No ("--nooverwrite")
=item --ref_minlength
Minimum length of database sequences to keep after trimming of vectors and
contaminant sequences. Default of 800 bp is about 50% for SSU rRNA gene length,
but should be changed if a custom database is used.
Default: 800
=item --CPUs I<N>
Number of processors to use
Default: All available
=item --mem I<N>
Memory limit (in Gb) for indexing tools. At least 10 is recommended.
Default: 10
=item --log I<FILE>
Write phyloFlash_makedb.pl log to a file.
Default: None
=item --check_env
Check that required dependencies are available in your path. If specifying
optional tools I<--sortmerna> and I<--emirge>, put the I<--check_env> argument
at the end of the command.
=item --help
Help message
=item --man
Full manual page
=item --version
Report version
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2014-2018 by Harald Gruber-Vodicka <[email protected]>
Elmar Pruesse <[email protected]>
Brandon Seah <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
use FindBin;
use lib $FindBin::RealBin;
use PhyloFlash;
use Pod::Usage;
use Getopt::Long;
use Net::FTP;
use Digest::MD5;
use IO::Uncompress::AnyUncompress qw(anyuncompress $AnyUncompressError);
use Cwd;
use Storable;
use File::Spec;
use File::Basename;
# URLS
my $silva_url = "ftp.arb-silva.de/current/Exports/*_SSURef_N?99_tax_silva_trunc.fasta.gz";
my $univec_url = "ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec";
# constants
my $dbsource;
my $silva_file = "";
my $univec_file = "";
my $cwd = getcwd;
my $timer = new Timer;
my $cpus = get_cpus # num cpus to use
my $keep = 0; # Keep temp files
my $sortmerna = 0; # Index sortmerna tool
my $emirge = 1; # Index emirge
my $memlimitGb = 10; # In Gb
my $makedb_log;
my $ref_minlength=800;
# commandline arguments
my $use_remote = 0; # Download SILVA and Univec databases via FTP
my $overwrite = 0;
# parse cmdline
pod2usage(-verbose=>0) if !@ARGV;
GetOptions("remote|r" => \$use_remote,
"silva_file=s" => \$silva_file,
"univec_file=s" => \$univec_file,
"emirge!" => \$emirge,
"sortmerna!" => \$sortmerna,
"CPUs=i" => \$cpus, # override default CPU usage if necessary
"mem=i" => \$memlimitGb,
"check_env" => sub { process_dependencies();
check_environment();
exit; },
"keep|k" => \$keep,
"overwrite!" => \$overwrite,
"ref_minlength=i" => \$ref_minlength,
"log=s" => \$makedb_log,
"help|h" => sub{pod2usage(-verbose=>1);},
"man|m" => sub {pod2usage(-verbose=>2);},
"version" => sub { welcome();
exit(); },
)
or pod2usage(-verbose=>0,-exit=>2);
if ($use_remote==0 && ($silva_file eq "" | $univec_file eq "")) {
pod2usage("Please specify paths to local copies of SILVA and Univec databases.");
}
process_dependencies();
check_environment();
## MAIN ########################################################################
welcome();
# check whether user has local copy of Univec and Silva files, or if
# they should be downloaded
if ($use_remote==1 && $univec_file eq "") {
msg("downloading latest univec from ncbi");
$univec_file = file_download($univec_url);
}
elsif ($use_remote==0 && $univec_file ne "") {
msg("using local copy of univec: $univec_file");
}
else {
err("No univec file found and downloading disabled.");
}
if ($use_remote==1 && $silva_file eq "") {
msg("downloading latest SSU RefNR from www.arb-silva.de");
$silva_file = file_download($silva_url);
}
elsif ($use_remote==0 && $silva_file ne "") {
msg("using local copy of Silva SSU RefNR: $silva_file");
}
else {
err("No SILVA database found and downloading disabled.");
}
# extract SILVA version
# get file basename, in case a path is specified
my ($silva_file_filename,
$silva_file_dirs,
$silva_file_suffix) = fileparse($silva_file);
my ($silva_release) = ($silva_file_filename =~ m/SILVA_([^_]+)_/);
if (!$&) {
err("Unable to extract version from SILVA database filename:",
"Expected 'SILVA_<version>_...' in '$silva_file'.");
}
# create database directory
my $dbdir = "./".$silva_release."/";
if (! -e $dbdir) {
mkdir $dbdir
or err("Failed to create dir $silva_release", $!);
}
msg("unpacking SILVA database");
if (! -e "$dbdir/SILVA_SSU.fasta" || $overwrite == 1 ) {
anyuncompress $silva_file => "$dbdir/SILVA_SSU.fasta"
or err("unpacking failed: $AnyUncompressError");
} else {
msg ("File $dbdir/SILVA_SSU.fasta exists, not overwriting");
}
my @lsu_in_ssh;
if (! -e "$dbdir/SILVA_SSU.noLSU.fasta" || $overwrite == 1) {
@lsu_in_ssh = find_LSU("$dbdir/SILVA_SSU.noLSU.fasta");
msg ("Removing sequences with potential LSU contamination");
fasta_copy_except("$dbdir/SILVA_SSU.fasta",
"$dbdir/SILVA_SSU.noLSU.fasta",
@lsu_in_ssh,
$overwrite);
unlink "$dbdir/SILVA_SSU.fasta" unless ($keep==1);
unlink glob "tmp.barrnap_hits.*" unless ($keep==1);
} else {
msg ("LSU-filtered file found, not overwriting");
}
mask_repeats("$dbdir/SILVA_SSU.noLSU.fasta",
"$dbdir/SILVA_SSU.noLSU.masked.fasta",
$overwrite);
unlink "$dbdir/SILVA_SSU.noLSU.fasta" unless ($keep==1);
univec_trim($univec_file,
"$dbdir/SILVA_SSU.noLSU.masked.fasta",
"$dbdir/SILVA_SSU.noLSU.masked.trimmed.fasta",
$overwrite,
$ref_minlength);
unlink "$dbdir/SILVA_SSU.noLSU.masked.fasta" unless ($keep==1);
# Index database into UDB file, if Vsearch v2.5.0+
# Speeds up run time in search phase of phyloFlash as db can be directly read to mem
my $vsearch_ver_check = check_vsearch_version();
if (defined $vsearch_ver_check) {
my $fasta_in = "$dbdir/SILVA_SSU.noLSU.masked.trimmed.fasta";
my $udb_out = "$dbdir/SILVA_SSU.noLSU.masked.trimmed.udb";
make_vsearch_udb($fasta_in,
$udb_out,
$overwrite);
}
#the cleaned, masked and trimmed databases are clustered
# 1) at 99id with full labels for bbmap
# 2) at 96id for emirge and sortmerna
cluster("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.fasta",
"./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR99.fasta",
"0.99",
$cpus,
$overwrite);
# no unlink -> trimmed.fasta needed for vsearch
fasta_copy_iupac_randomize("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR99.fasta",
"./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR99.fixed.fasta",
$overwrite);
bbmap_db("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR99.fixed.fasta",
"./$silva_release/",
$overwrite);
cluster("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR99.fasta",
"./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fasta",
"0.96",
$cpus,
$overwrite);
unlink "$dbdir/SILVA_SSU.noLSU.masked.trimmed.NR99.fasta" unless ($keep==1);
fasta_copy_iupac_randomize("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fasta",
"./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fixed.fasta",
$overwrite);
unlink "$dbdir/SILVA_SSU.noLSU.masked.trimmed.NR96.fasta" unless ($keep==1);
bowtie_index("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fixed.fasta") if $emirge == 1;
if ($sortmerna == 1) {
sortmerna_index("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fixed.fasta",
$overwrite);
hash_SILVA_acc_taxstrings_from_fasta ("./$silva_release/SILVA_SSU.noLSU.masked.trimmed.NR96.fixed.fasta",
$overwrite);
}
# Clean up log files at end - if breaks in middle they will be available for debug
unless ($keep==1) {
unlink "tmp.bbmask_mask_repeats.log"; # from mask_repeats
unlink "tmp.bbduk_remove_univec.log"; # from univec_trim
if (defined $vsearch_ver_check) {
unlink "tmp.vsearch_make_udb.log"; # from make_vsearch_udb
}
unlink "tmp.bbmap_index.log"; # from bbmap_db
unlink "tmp.bowtiebuild.log"; # from bowtie_index
if ($sortmerna == 1) {
unlink "tmp.indexdb_rna.log"; # from sortmerna_index
}
}
finish();
write_logfile($makedb_log) if defined $makedb_log;
## SUBS ########################################################################
sub welcome {
print STDERR "\nThis is phyloFlash_makedb.pl from phyloFlash.pl v$VERSION\n\n";
}
sub process_dependencies {
# binaries needed by phyloFlash
require_tools((
barrnapHGV => "$FindBin::RealBin/barrnap-HGV/bin/barrnap_HGV",
grep => "grep",
bbmask => "bbmask.sh",
bbduk => "bbduk.sh",
bbmap => "bbmap.sh",
vsearch => 'vsearch',
));
# Binaries for optional tools
if ($emirge == 1) {
require_tools ((
bowtiebuild => "bowtie-build",
));
}
if ($sortmerna == 1) {
# If Sortmerna is required, add to dependencies list
require_tools ((
indexdb_rna => "indexdb_rna",
));
}
}
# searching for LSU genes in the SSU RefNR using a modified b
# barrnap version that only utilizes LSU hmm profiles
# barrnap was changed to use a different db folder that
# only contains the LSU profiles
sub find_LSU {
msg("searching for LSU contamination in SSU RefNR");
my @lsus;
my $fh;
foreach ('bac', 'arch', 'euk') {
my $log = "tmp.barrnap_hits.$_.barrnap.out";
my $res = "tmp.barrnap_hits.$_.gff";
#1 or run_prog("barrnapHGV",
run_prog("barrnapHGV",
" --kingdom $_ "
. "--threads $cpus "
. "--evalue 1e-10 "
. "--gene lsu "
. "--reject 0.01 "
. "./$silva_release/SILVA_SSU.fasta ",
$res, $log);
open_or_die(\$fh, "-|",
"grep -E '23S_rRNA\|28S_rRNA' $res");
while (my $row = <$fh>) {
$row =~ s/\t.*//;
chomp($row);
push @lsus, $row;
}
}
return @lsus;
}
sub make_vsearch_udb {
my ($infile, $outfile, $overwrite) = @_;
msg ("Indexing $infile to make UDB file $outfile with Vsearch");
my $log = "tmp.vsearch_make_udb.log";
if (! -e $outfile || $overwrite == 1) {
my @vsearch_params = ("--threads $cpus",
'--notrunclabels', # Keep full header including taxstring
"--makeudb_usearch $infile",
"--output $outfile",
);
run_prog("vsearch",
join (" ", @vsearch_params),
undef,
$log
);
} else {
msg ("WARNING: File $outfile already exists. Not overwriting");
}
}
#bbmask masks low entropy regions and repeats in the fasta file
sub mask_repeats {
my ($src, $dst, $overwrite) = @_;
msg("masking low entropy regions in SSU RefNR");
my $log = "tmp.bbmask_mask_repeats.log";
if (! -e $dst || $overwrite == 1) {
run_prog("bbmask",
" overwrite=t "
. "-Xmx".$memlimitGb."g "
. "threads=$cpus "
. "in=$src "
. "out=$dst "
. "minkr=4 maxkr=8 mr=t minlen=20 minke=4 maxke=8 "
. "fastawrap=0 ",
undef,
$log);
} else {
msg ("File $dst exists, not overwriting");
}
}
#db is screened against UniVec and hit bases are trimmed using bbduk
sub univec_trim {
my ($univec, $src, $dst,$overwrite, $ref_minlength) = @_;
msg("removing UniVec contamination in SSU RefNR");
my $log = "tmp.bbduk_remove_univec.log";
if (! -e $dst || $overwrite == 1 ) {
run_prog("bbduk",
"ref=$univec "
. " overwrite=t "
. "-Xmx".$memlimitGb."g "
. "threads=$cpus "
. "fastawrap=0 "
. "ktrim=r ow=t minlength=$ref_minlength mink=11 hdist=1 "
. "in=$src "
. "out=$dst "
. "stats=$dst.UniVec_contamination_stats.txt",
undef,
$log);
} else {
msg ("File $dst exists, not overwriting");
}
}
#the NR99 file is fixed and a reference file is generated for bbmap
sub bbmap_db {
my ($ref, $path, $overwrite) = @_;
msg("creating bbmap reference");
my $log = "tmp.bbmap_index.log";
if (!-d "$path/ref" || $overwrite == 1) {
run_prog("bbmap",
" -Xmx".$memlimitGb."g " # Original 4 Gb limit was not enough
. "threads=$cpus "
. "ref=$ref "
. "path=$path ",
undef,
$log);
} else {
msg ("WARNING: Folder $path exists, not overwriting");
}
}
#the NR96 file is fixed and a bowtie index is built
sub bowtie_index {
my ($fasta) = @_;
my $btidx = $fasta =~ s/\.fasta$/\.bt/r;
my $log = "tmp.bowtiebuild.log";
msg("creating bowtie index (for emirge)");
run_prog("bowtiebuild",
"$fasta $btidx -q",
undef,
$log);
}
# Generate index for Sortmerna from the filtered fixed SSU clustered at 96% id
sub sortmerna_index {
my ($fasta,$overwrite) = @_;
my $memlimitMb = $memlimitGb * 1000;
my $prefix = $fasta =~ s/\.fasta$//r;
my $log = "tmp.indexdb_rna.log";
msg ("creating sortmerna index");
if (! -e "$prefix.bursttrie_0.dat" || $overwrite == 1) {
my @indexdb_args = ("--ref $fasta,$prefix",
"-m $memlimitMb",
"-v");
run_prog("indexdb_rna",
join (" ", @indexdb_args),
undef,
$log
)
} else {
msg ("WARNING: SortMeRNA indices for file prefix $prefix already exist, not overwriting");
}
}
sub hash_SILVA_acc_taxstrings_from_fasta {
# Hash of accession numbers and taxonomy strings from SILVA fasta headers
# Store as a perl hash image with Storable
# For later when wrangling sortmerna SAM output to bbmap-like format
my ($fasta) = @_;
my $prefix = $fasta =~ s/\.fasta$//r;
msg ("Hashing SILVA accession numbers to taxonomy strings");
if (! -e "$prefix.acc2taxstring.hashimage" || $overwrite == 1) {
my %hash;
open(my $fh, "<", $fasta) or die ("$!");
while (my $line = <$fh>) {
if ($line =~ m/^>(.+)/) {
my $header = $1;
chomp $header;
my ($id, @taxsplit) = split / /, $header; # Split on first space
$hash{$id} = join " ", @taxsplit; # Rejoin tax string if it has spaces
}
}
close($fh);
store \%hash, "$prefix.acc2taxstring.hashimage";
} else {
msg ("WARNING: File $prefix.acc2taxstring.hashimage already exists, not overwriting");
}
}
#--------------------------------------------- final timing stats and goodbye
sub finish {
my $dbdir_abs = File::Spec->rel2abs($dbdir); # Convert to absolute path
msg("Walltime used: ".$timer->minutes);;
msg("The databases for Silva release $silva_release are ready for phyloFlash
When running phyloFlash, please provide the location of
the databases with the following option:
-dbhome $dbdir_abs
or add the following line to your .bashrc or .bash_profile:
export PHYLOFLASH_DBHOME=$dbdir_abs");
}