diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce0fc99e..df1449e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
- [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation mode (added by @jfy133)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database decompression so that each compressed input database is untarred only once and shared amongst all runs using that database with different parameters (added by @jfy133)
+- [#461](https://github.com/nf-core/taxprofiler/pull/461) Added a new parameter (`--save_untarred_databases`) to optionally save the decompressed input databases (added by @jfy133)
### `Fixed`
diff --git a/conf/modules.config b/conf/modules.config
index e3fb0f18..f5a5e631 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -18,6 +18,15 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
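+    // Publish the decompressed databases only when the user opts in via --save_untarred_databases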
+ withName: UNTAR {
+ ext.prefix = { "${archive.simpleName}" }
+ publishDir = [
+ path: { "${params.outdir}/untar/databases" },
+ mode: params.publish_dir_mode,
+ enabled: params.save_untarred_databases
+ ]
+ }
+
withName: FASTQC {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
@@ -512,6 +521,14 @@ process {
]
}
+ withName: KRAKENTOOLS_KREPORT2KRONA {
+ publishDir = [
+ enabled: false,
+ mode: params.publish_dir_mode,
+ pattern: '*.txt'
+ ]
+ }
+
withName: KRONA_CLEANUP {
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" }
publishDir = [
diff --git a/docs/output.md b/docs/output.md
index cf4678c3..2cebd463 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -10,6 +10,7 @@ The directories listed below will be created in the results directory after the
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+- [UNTAR](#untar) - Optionally saves decompressed input databases
- [FastQC](#fastqc) - Raw read QC
- [falco](#fastqc) - Alternative to FastQC for raw read QC
- [fastp](#fastp) - Adapter trimming for Illumina data
@@ -40,6 +41,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
![](images/taxprofiler_tube.png)
+### untar
+
+untar is used in nf-core/taxprofiler to decompress input files ending in `.tar.gz`, primarily the user-supplied database archives.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `untar/`
+  - `databases/`
+    - `<database>/`: directory containing the contents of the decompressed database archive
+
+</details>
+
+This directory will only be present if `--save_untarred_databases` is supplied. The contained directories can be moved to a central 'cache' location, allowing the same decompressed databases to be re-used across runs and saving the computational time of decompressing the archives on every run.
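+
+For example, after moving a decompressed database to a shared location, the database sheet can point directly at the decompressed directory rather than the original archive, skipping the decompression step entirely. A minimal sketch, assuming the standard four-column database sheet and a hypothetical cache path:
+
+```csv
+tool,db_name,db_params,db_path
+kraken2,standard,,/shared/cache/taxprofiler/kraken2_standard
+```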
+
### FastQC or Falco
diff --git a/nextflow.config b/nextflow.config
index 26d23d96..9cf67533 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -6,6 +6,8 @@
----------------------------------------------------------------------------------------
*/
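+// Strict mode turns otherwise-silent issues, such as referencing undefined
+// variables or parameters, into errors (see the Nextflow docs for all checks)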
+nextflow.enable.strict = true
+
// Global default params, used in configs
params {
@@ -30,6 +32,7 @@ params {
email_on_fail = null
plaintext_email = false
monochrome_logs = false
+    monochromeLogs = false // required so that nf-validation and nextflow.enable.strict work nicely together
hook_url = null
help = false
version = false
@@ -51,12 +54,13 @@ params {
// Schema validation default options
validationFailUnrecognisedParams = false
validationLenientMode = false
- validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta'
+ validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta,monochromeLogs'
validationShowHiddenParams = false
validate_params = true
// Databases
- databases = null
+ databases = null
+ save_untarred_databases = false
// FASTQ preprocessing
skip_preprocessing_qc = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 73364791..1ce1ee54 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -34,6 +34,12 @@
"description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler",
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database are supplied. \n\nWe recommend storing this database sheet somewhere centrally and accessible by others members of your lab/institutions, as this file will likely be regularly reused."
},
+ "save_untarred_databases": {
+ "type": "boolean",
+ "fa_icon": "fas fa-database",
+ "description": "Specify to save decompressed user-supplied TAR archives of databases",
+ "help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs. Specifying this parameter will save these to `--outdir results/` under a directory called `untar`."
+ },
"outdir": {
"type": "string",
"format": "directory-path",
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index cc6955cf..6e079164 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -153,21 +153,37 @@ workflow TAXPROFILER {
skip: true
}
// Filter the channel to untar only those databases for tools that are selected to be run by the user.
- ch_input_untar = ch_dbs_for_untar.untar
+    // Also, to ensure each archive is only untarred once, group together all databases that share the same archive file
+ ch_inputdb_untar = ch_dbs_for_untar.untar
.filter { db_meta, db_path ->
params[ "run_${db_meta.tool}" ]
}
- UNTAR ( ch_input_untar )
-
- ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )
- ch_final_dbs
-        .map { db_meta, db ->
- def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params
- db_meta.db_params = corrected_db_params
- [ db_meta, db ]
+ .groupTuple(by: 1)
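+        // after grouping on the archive path (index 1), `meta` below is the list of
+        // all db_meta maps that refer to the same archive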
+ .map {
+ meta, dbfile ->
+ def new_meta = [ 'id': dbfile.baseName ] + [ 'meta': meta ]
+                [ new_meta, dbfile ]
}
+
+ // Untar the databases
+ UNTAR ( ch_inputdb_untar )
ch_versions = ch_versions.mix( UNTAR.out.versions.first() )
+    // Spread the shared untarred databases back out, re-associating each original db_meta with its decompressed directory
+ ch_outputdb_from_untar = UNTAR.out.untar
+ .map {
+ meta, db ->
+ [meta.meta, db]
+ }
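+        // transpose emits one [ db_meta, db ] tuple per original database entry,
+        // pairing each db_meta in the list with the shared decompressed directory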
+ .transpose(by: 0)
+
+ ch_final_dbs = ch_dbs_for_untar.skip
+ .mix( ch_outputdb_from_untar )
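+        // normalise a missing db_params to an empty string so downstream processes always receive a value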
+ .map { db_meta, db ->
+ def corrected_db_params = db_meta.db_params ? [ db_params: db_meta.db_params ] : [ db_params: '' ]
+ [ db_meta + corrected_db_params, db ]
+ }
+
/*
MODULE: Run FastQC
*/