From 4d6e11140b62c56902da0c4a7ac81c8db1ea2ea0 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 21 Mar 2024 09:56:49 +0100 Subject: [PATCH 01/16] Try adding nextflow strict --- nextflow.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nextflow.config b/nextflow.config index 26d23d96..519e2648 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,6 +6,8 @@ ---------------------------------------------------------------------------------------- */ +nextflow.enable.strict = true + // Global default params, used in configs params { @@ -30,6 +32,7 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false + monochromeLogs = false hook_url = null help = false version = false From aaf6d62ad069684d7623d0116c342ccf349e035a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 21 Mar 2024 12:15:46 +0100 Subject: [PATCH 02/16] Start fixing untar MALT db error --- nextflow.config | 2 +- workflows/taxprofiler.nf | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 519e2648..a591e653 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,7 +32,7 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false - monochromeLogs = false + monochromeLogs = false // required so nf-validation nextflow.enabled.strict works nicely together hook_url = null help = false version = false diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ccee03f3..04acdbed 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -155,13 +155,33 @@ workflow TAXPROFILER { skip: true } // Filter the channel to untar only those databases for tools that are selected to be run by the user. 
- ch_input_untar = ch_dbs_for_untar.untar + // Only untar once, and spread out again after + ch_inputdb_untar = ch_dbs_for_untar.untar .filter { db_meta, db_path -> params[ "run_${db_meta.tool}" ] } - UNTAR ( ch_input_untar ) + .groupTuple(by: 1) + .map { + meta, dbfile -> + def new_meta = [ 'id': dbfile.baseName ] + [ 'meta': meta ] + [new_meta , dbfile ] + } + .dump(tag: 'for_untar') + + UNTAR ( ch_inputdb_untar ) + + ch_outputdb_from_untar = UNTAR.out.untar + .map { + meta, db -> + [meta.meta, db] + } + .dump(tag: 'post_untar') + .transpose(by: 1) + .dump(tag: 'from_untar') + + // TODO spread UNTARed stuff - ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar ) + ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_outputdb_from_untar ) ch_final_dbs .map { db_meta, db -> [ db_meta.db_params ] def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params From d8f136d119925d91251b48111b18475340b66ce8 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 21 Mar 2024 13:35:32 +0100 Subject: [PATCH 03/16] Optimise database untarring and prevent publishing clashses --- workflows/taxprofiler.nf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 04acdbed..12087324 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -155,7 +155,7 @@ workflow TAXPROFILER { skip: true } // Filter the channel to untar only those databases for tools that are selected to be run by the user. 
- // Only untar once, and spread out again after + // Also, to ensure only untar once per file, group together all databases of one file ch_inputdb_untar = ch_dbs_for_untar.untar .filter { db_meta, db_path -> params[ "run_${db_meta.tool}" ] @@ -168,18 +168,16 @@ workflow TAXPROFILER { } .dump(tag: 'for_untar') + // Untar the databases UNTAR ( ch_inputdb_untar ) + // Spread out the untarred and shared databases ch_outputdb_from_untar = UNTAR.out.untar .map { meta, db -> [meta.meta, db] } - .dump(tag: 'post_untar') - .transpose(by: 1) - .dump(tag: 'from_untar') - - // TODO spread UNTARed stuff + .transpose(by: 0) ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_outputdb_from_untar ) ch_final_dbs From c099269c42bedfaf040e38b3bc2169d41df27365 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 21 Mar 2024 13:37:49 +0100 Subject: [PATCH 04/16] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce0fc99e..084630db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam) +- []() Turned on 'strict' Nextflow evaluation runs (added by @jfy133) +- []() Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) ### `Fixed` From 28c250e0969aa2256bf66f38ca080e3881b10f11 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 21 Mar 2024 16:04:10 +0100 Subject: [PATCH 05/16] Update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 084630db..1c5b7775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam) -- []() Turned on 'strict' Nextflow evaluation runs (added by @jfy133) -- []() Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) +- [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation runs (added by @jfy133) +- [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) ### `Fixed` From c5b112d3e7d975d80c27a353f156d6e7b32cedf4 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 21 Mar 2024 16:09:52 +0100 Subject: [PATCH 06/16] Add nf-validation specific parameter to ignore list --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index a591e653..850572a9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,7 +54,7 @@ params { // Schema validation default options validationFailUnrecognisedParams = false validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta' + validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta,monochromeLogs' validationShowHiddenParams = false validate_params = true From 06d7ecf702db4194298c442d59cf6548898dd16f Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 22 Mar 2024 08:53:58 +0100 Subject: [PATCH 07/16] Remove a debugging dump --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 12087324..acb39439 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -166,7 +166,6 @@ workflow TAXPROFILER { def new_meta = [ 'id': dbfile.baseName ] + [ 'meta': meta ] [new_meta , dbfile ] } - .dump(tag: 'for_untar') // Untar the databases UNTAR ( ch_inputdb_untar ) @@ -186,6 +185,7 @@ workflow TAXPROFILER { db_meta.db_params = corrected_db_params [ db_meta, db ] } + .dump(tag: 'final_dbs') ch_versions = ch_versions.mix( UNTAR.out.versions.first() ) /* From b527a8554274c1e17106e6dd7aa7a56b54314ce2 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 22 Mar 2024 09:31:34 +0100 Subject: [PATCH 08/16] Remove final dump --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index acb39439..d9c88182 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -185,7 +185,7 @@ workflow TAXPROFILER { db_meta.db_params = corrected_db_params [ db_meta, db ] } - .dump(tag: 'final_dbs') + ch_versions = ch_versions.mix( UNTAR.out.versions.first() ) /* From e6097e6d80e503f824ac65b1cc0f591db86a2477 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 22 Mar 2024 09:31:39 +0100 Subject: [PATCH 09/16] Actually remove it --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index d9c88182..14f17ddb 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -169,6 +169,7 @@ workflow TAXPROFILER { // Untar the databases UNTAR ( ch_inputdb_untar ) + ch_versions = ch_versions.mix( UNTAR.out.versions.first() ) // Spread out the untarred and shared databases ch_outputdb_from_untar = UNTAR.out.untar @@ -186,7 +187,6 @@ workflow TAXPROFILER { [ db_meta, db ] } - ch_versions = ch_versions.mix( UNTAR.out.versions.first() ) /* MODULE: Run FastQC From 4dc09007bc47630862de0960f7d1aa8b79f3f888 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 28 Mar 2024 08:35:58 +0000 Subject: [PATCH 10/16] Fix occasional split errors, due to db_params replacement not actually being assigned to a channel, ensure new metamap made on db_param cleanup --- workflows/taxprofiler.nf | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 14f17ddb..87d43af4 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -179,14 +179,12 @@ workflow TAXPROFILER { } .transpose(by: 0) - ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_outputdb_from_untar ) - ch_final_dbs - .map { db_meta, db -> [ db_meta.db_params ] - def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params - db_meta.db_params = corrected_db_params - [ db_meta, db ] - } - + ch_final_dbs = ch_dbs_for_untar.skip + .mix( ch_outputdb_from_untar ) + .map { db_meta, db -> + def corrected_db_params = db_meta.db_params == null ? [ db_params: '' ] : [ db_params: db_meta.db_params ] + [ db_meta + corrected_db_params, db ] + } /* MODULE: Run FastQC From 630a25c7d9cd586acb1e4396d79db943a84fbbc3 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 28 Mar 2024 09:44:32 +0000 Subject: [PATCH 11/16] Update workflows/taxprofiler.nf Co-authored-by: Moritz E. Beber --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 87d43af4..36af3074 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -182,7 +182,7 @@ workflow TAXPROFILER { ch_final_dbs = ch_dbs_for_untar.skip .mix( ch_outputdb_from_untar ) .map { db_meta, db -> - def corrected_db_params = db_meta.db_params == null ? [ db_params: '' ] : [ db_params: db_meta.db_params ] + def corrected_db_params = db_meta.db_params ? [ db_params: db_meta.db_params ] : [ db_params: '' ] [ db_meta + corrected_db_params, db ] } From 1b4c5f0369de78e6e7dd42aa4b2209fe60bdbe15 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 4 Apr 2024 09:32:25 +0000 Subject: [PATCH 12/16] Add option to save uncompressed databases, turn off Kraken2 to krona conversion output file publishing --- conf/modules.config | 17 +++++++++++++++++ docs/output.md | 16 ++++++++++++++++ nextflow.config | 3 ++- nextflow_schema.json | 6 ++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 6fb846b4..8014309a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: UNTAR { + ext.prefix = { "${archive.simpleName}" } + publishDir = [ + path: { "${params.outdir}/untar/databases" }, + mode: params.publish_dir_mode, + enabled: params.save_untarred_databases + ] + } + withName: FASTQC { ext.args = '--quiet' ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } @@ -512,6 +521,14 @@ process { ] } + withName: KRAKENTOOLS_KREPORT2KRONA { + publishDir = [ + enabled: false, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + } + withName: KRONA_CLEANUP { ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" } publishDir = [ diff --git a/docs/output.md b/docs/output.md index cf4678c3..2cebd463 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,6 +10,7 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [UNTAR](#untar) - Optionally saved decompressed input databases - [FastQC](#fastqc) - Raw read QC - [falco](#fastqc) - Alternative to FastQC for raw read QC - [fastp](#fastp) - Adapter trimming for Illumina data @@ -40,6 +41,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ![](images/taxprofiler_tube.png) +### untar + +untar is used in nf-core/taxprofiler to decompress various input files ending in `.tar.gz`. This process is mainly used for decompressing input database archive files. + +
+Output files + +- `untar/` + - `databases/` + - `<archive_simple_name>/`: directory containing contents of the decompressed archive + +
+ +This directory will only be present if `--save_untarred_databases` is supplied. The contained directories can be moved to a central 'cache' location, allowing users to re-use the same databases. This avoids spending unnecessary computational time decompressing the archives on every run. + ### FastQC or Falco
diff --git a/nextflow.config b/nextflow.config index 850572a9..9cf67533 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,7 +59,8 @@ params { validate_params = true // Databases - databases = null + databases = null + save_untarred_databases = false // FASTQ preprocessing skip_preprocessing_qc = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 73364791..641bef19 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -34,6 +34,12 @@ "description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database are supplied. \n\nWe recommend storing this database sheet somewhere centrally and accessible by others members of your lab/institutions, as this file will likely be regularly reused." }, + "save_untarred_databases": { + "type": "boolean", + "fa_icon": "fas fa-database", + "description": "Specify to save decompressed user-supplied TAR archives of databases", + "help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs./n/n Specifying this parameter will save these to `--outdir results/` under a directory called `untar`." + }, "outdir": { "type": "string", "format": "directory-path", From 54a6cd75c7d6f08228a575ae355dc334cab54291 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 4 Apr 2024 12:05:54 +0200 Subject: [PATCH 13/16] Apply suggestions from code review --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c5b7775..4d31b0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation runs (added by @jfy133) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) +[#461](https://github.com/nf-core/taxprofiler/pull/461) Added new parameter to optionally save uncompressed databases (added by @jfy133) ### `Fixed` From 968ae646a3bd77a96e1387ca1bb2563bc6c1275f Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Thu, 4 Apr 2024 10:24:50 +0000 Subject: [PATCH 14/16] [automated] Fix code linting --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d31b0fc..cea40bea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation runs (added by @jfy133) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) -[#461](https://github.com/nf-core/taxprofiler/pull/461) 
Added new parameter to optionally save uncompressed databases (added by @jfy133) + [#461](https://github.com/nf-core/taxprofiler/pull/461) Added new parameter to optionally save uncompressed databases (added by @jfy133) ### `Fixed` From f894a4b20518df88d7c721ba8983f0f6c501f2cc Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 4 Apr 2024 12:45:20 +0200 Subject: [PATCH 15/16] Update CHANGELOG.md Co-authored-by: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cea40bea..df1449e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation runs (added by @jfy133) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database compression so each compressed input database is untarred once, and shared amongst each run with different parameters (added by @jfy133) - [#461](https://github.com/nf-core/taxprofiler/pull/461) Added new parameter to optionally save uncompressed databases (added by @jfy133) +- [#461](https://github.com/nf-core/taxprofiler/pull/461) Added new parameter to optionally save uncompressed databases (added by @jfy133) ### `Fixed` From 2ebce4cd61ec380e6ceb02cafa18082826637508 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 4 Apr 2024 12:46:43 +0200 Subject: [PATCH 16/16] Update nextflow_schema.json Co-authored-by: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 641bef19..1ce1ee54 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -38,7 +38,7 @@ "type": "boolean", "fa_icon": "fas fa-database", "description": "Specify to save decompressed user-supplied TAR archives of databases", - "help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs./n/n Specifying this parameter will save these to `--outdir results/` under a directory called `untar`." + "help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs. Specifying this parameter will save these to `--outdir results/` under a directory called `untar`." }, "outdir": { "type": "string",