From c6134cc777385aa4c9a0f21303a7e6fbde782deb Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 27 Mar 2024 16:04:31 +0100 Subject: [PATCH 1/6] chore: avoid mOTUs download progress report --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 44bb4c5f..e6d5d4df 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: run: | if [[ "${{ matrix.tags }}" == "test_motus" ]]; then wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py - python downloadDB.py > download_db_log.txt + python downloadDB.py --no-download-progress echo 'tool,db_name,db_params,db_path' > 'database_motus.csv' echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv' nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; From f7caedcce62134f3e99acc9993ed5480a965fe86 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 27 Mar 2024 17:08:36 +0100 Subject: [PATCH 2/6] fix: transform channels to combine on DB name Change wording of function and parameters description. --- .../local/standardisation_profiles.nf | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index 4592e9de..1f121b98 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -15,25 +15,28 @@ include { GANON_TABLE // Custom Functions /** -* Combine profiles with their original database, then separate into two channels. +* Combine profiles with their corresponding reference database, then separate into two channels. * -* The channel elements are assumed to be tuples one of [ meta, profile ], and the -* database to be of [db_key, meta, database_file]. 
+* The combined results are returned on multiple channels, where the element +* position for the profiles in one channel is the same as the position of the +* corresponding database element in the other channel. * -* @param ch_profile A channel containing a meta and the profilign report of a given profiler -* @param ch_database A channel containing a key, the database meta, and the database file/folders itself -* @return A multiMap'ed output channel with two sub channels, one with the profile and the other with the db +* @param ch_profiles A channel containing pairs of a meta map with an `id` key +* for a reference database, and all the corresponding profiling reports. +* @param ch_database A channel containing pairs of a database meta map and the +* database itself. +* @return A multiMap'ed output channel with two sub channels, one with the +* profiles (`profile`) and the other with the corresponding database (`db`). */ -def combineProfilesWithDatabase(ch_profile, ch_database) { - -return ch_profile - .map { meta, profile -> [meta.db_name, meta, profile] } - .combine(ch_database, by: 0) - .multiMap { - key, meta, profile, db_meta, db -> - profile: [meta, profile] - db: db - } +def combineProfilesWithDatabase(ch_profiles, ch_database) { + return ch_profiles + .map { meta, profile -> [meta.id, meta, profile] } + .combine(ch_database.map { db_meta, db -> [db_meta.db_name, db] }, by: 0) + .multiMap { + key, meta, profile, db -> + profile: [meta, profile] + db: db + } } workflow STANDARDISATION_PROFILES { From 3d305475f31d72048a55eb2df80d38070ba7c932 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Wed, 27 Mar 2024 17:40:06 +0100 Subject: [PATCH 3/6] refactor: extract function to group profiles --- .../local/standardisation_profiles.nf | 88 +++++++++---------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index 1f121b98..c005ecd3 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -14,6 +14,21 @@ include { GANON_TABLE // Custom Functions +/** +* Group all profiles per reference database. +* +* @param ch_profiles A channel containing pairs of a meta map and the report of +* a given profiler, where meta must contain a key `db_name`. +* @return A channel with one element per reference database. Each element is a +* pair of a meta map with an `id` key and all corresponding profiles. +*/ +def groupProfiles(ch_profiles, groupTupleOptions = [:]) { + return ch_profiles + .map { meta, profile -> [meta.db_name, profile] } + .groupTuple(groupTupleOptions) + .map { db_name, profiles -> [[id: db_name], profiles] } +} + /** * Combine profiles with their corresponding reference database, then separate into two channels. 
* @@ -120,12 +135,7 @@ workflow STANDARDISATION_PROFILES { // Bracken - ch_profiles_for_bracken = ch_input_profiles.bracken - .map { [it[0]['db_name'], it[1]] } - .groupTuple() - .map { - [[id:it[0]], it[1]] - } + ch_profiles_for_bracken = groupProfiles(ch_input_profiles.bracken) BRACKEN_COMBINEBRACKENOUTPUTS ( ch_profiles_for_bracken ) @@ -134,13 +144,10 @@ workflow STANDARDISATION_PROFILES { // Collect and replace id for db_name for prefix // Have to sort by size to ensure first file actually has hits otherwise // the script fails - ch_profiles_for_centrifuge = ch_input_profiles.centrifuge - .map { [it[0]['db_name'], it[1]] } - .groupTuple(sort: {-it.size()} ) - .map { - [[id:it[0]], it[1]] - } - + ch_profiles_for_centrifuge = groupProfiles( + ch_input_profiles.centrifuge, + [sort: { -it.size() }] + ) KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt ) @@ -149,12 +156,7 @@ workflow STANDARDISATION_PROFILES { // Kaiju // Collect and replace id for db_name for prefix - ch_profiles_for_kaiju = ch_input_classifications.kaiju - .map { [it[0]['db_name'], it[1]] } - .groupTuple() - .map { - [[id:it[0]], it[1]] - } + ch_profiles_for_kaiju = groupProfiles(ch_input_classifications.kaiju) ch_input_for_kaiju2tablecombine = combineProfilesWithDatabase(ch_profiles_for_kaiju, ch_input_databases.kaiju) @@ -167,16 +169,23 @@ workflow STANDARDISATION_PROFILES { // Collect and replace id for db_name for prefix // Have to sort by size to ensure first file actually has hits otherwise // the script fails - ch_profiles_for_kraken2 = ch_input_profiles.kraken2 - .map { - meta, profiles -> - def new_meta = [:] - new_meta.tool = meta.tool == 'kraken2-bracken' ? 
'kraken2' : meta.tool // replace to get the right output-format description - new_meta.id = meta.tool // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken - new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken - [ new_meta, profiles ] - } - .groupTuple(sort: {-it.size()}) + ch_profiles_for_kraken2 = groupProfiles( + ch_input_profiles.kraken2.map { meta, profile -> + def new_meta = [ + // Replace the tool name to get the right output-format description. + tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, + // Append so as to disambiguate when we have same databases + // for kraken2 step of bracken, with normal bracken. + id: meta.tool, + // Append so as to disambiguate when we have same databases + // for kraken2 step of bracken, with normal bracken. + db_name: meta.tool == 'kraken2-bracken' ? 
"${meta.db_name}-bracken" : "${meta.db_name}" + + ] + return [meta + new_meta, profile] + }, + [sort: { -it.size() }] + ) KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) @@ -184,12 +193,7 @@ workflow STANDARDISATION_PROFILES { // MetaPhlAn - ch_profiles_for_metaphlan = ch_input_profiles.metaphlan - .map { [it[0]['db_name'], it[1]] } - .groupTuple() - .map { - [[id:it[0]], it[1]] - } + ch_profiles_for_metaphlan = groupProfiles(ch_input_profiles.metaphlan) METAPHLAN_MERGEMETAPHLANTABLES ( ch_profiles_for_metaphlan ) ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) @@ -201,12 +205,7 @@ workflow STANDARDISATION_PROFILES { // Therefore removing db info here, and publish merged at root mOTUs results // directory - ch_profiles_for_motus = ch_input_profiles.motus - .map { [it[0]['db_name'], it[1]] } - .groupTuple() - .map { - [[id:it[0]], it[1]] - } + ch_profiles_for_motus = groupProfiles(ch_input_profiles.motus) ch_input_for_motusmerge = combineProfilesWithDatabase(ch_profiles_for_motus, ch_input_databases.motus) @@ -215,12 +214,7 @@ workflow STANDARDISATION_PROFILES { // Ganon - ch_profiles_for_ganon = ch_input_profiles.ganon - .map { [it[0]['db_name'], it[1]] } - .groupTuple() - .map { - [[id:it[0]], it[1]] - } + ch_profiles_for_ganon = groupProfiles(ch_input_profiles.ganon) GANON_TABLE ( ch_profiles_for_ganon ) ch_multiqc_files = ch_multiqc_files.mix( GANON_TABLE.out.txt ) From 2c430aa7aaa6c6bc9494af56a8d35dbaf7b94019 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Wed, 27 Mar 2024 18:01:44 +0100 Subject: [PATCH 4/6] docs: add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index df1449e1..bbebe94c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` - [#336](https://github.com/nf-core/taxprofiler/issues/336) Replace samplesheet check with nf-validation for both sample and database input sheets (fix by @LilyAnderssonLee) +- [#460](https://github.com/nf-core/taxprofiler/issues/460) Corrected the channel transformations to combine Kaiju and mOTUs reports with their reference databases (fix by @Midnighter) ### `Dependencies` From 46e6637b11c0b4f5bf8d192f932d9073563027cb Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 6 Apr 2024 15:10:40 +0200 Subject: [PATCH 5/6] fix: correct file name for combined kraken2 reports --- conf/modules.config | 2 +- .../local/standardisation_profiles.nf | 20 ++++++------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f5a5e631..1956605e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -502,7 +502,7 @@ process { } withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN { - ext.prefix = { "kraken2_${meta.db_name}_combined_reports" } + ext.prefix = { "kraken2_${meta.id}_combined_reports" } publishDir = [ path: { "${params.outdir}/kraken2/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index c005ecd3..3b664204 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -170,22 +170,14 @@ workflow STANDARDISATION_PROFILES { // Have to sort by size to ensure first file actually has hits otherwise // the script fails ch_profiles_for_kraken2 = groupProfiles( - ch_input_profiles.kraken2.map { meta, 
profile -> - def new_meta = [ - // Replace the tool name to get the right output-format description. - tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, - // Append so as to disambiguate when we have same databases - // for kraken2 step of bracken, with normal bracken. - id: meta.tool, - // Append so as to disambiguate when we have same databases - // for kraken2 step of bracken, with normal bracken. - db_name: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" - - ] - return [meta + new_meta, profile] + ch_input_profiles.kraken2.dump(tag: 'k2-profiles', pretty: true) + .map { meta, profile -> + // Replace database name, to get the right output description. + def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" + return [meta + [db_name: db_name], profile] }, [sort: { -it.size() }] - ) + ).dump(tag: 'k2-grouped', pretty: true) KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) From 399b87c072f4e2784bc3491ad3ef34d9e93f6b45 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sun, 7 Apr 2024 02:51:21 +0200 Subject: [PATCH 6/6] chore: remove channel dumps used for debugging Co-authored-by: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> --- subworkflows/local/standardisation_profiles.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index 3b664204..95cd9d3f 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -170,14 +170,14 @@ workflow STANDARDISATION_PROFILES { // Have to sort by size to ensure first file actually has hits otherwise // the script fails ch_profiles_for_kraken2 = groupProfiles( - ch_input_profiles.kraken2.dump(tag: 'k2-profiles', pretty: true) + ch_input_profiles.kraken2 .map { meta, profile -> // Replace database name, to get the right output description. def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" return [meta + [db_name: db_name], profile] }, [sort: { -it.size() }] - ).dump(tag: 'k2-grouped', pretty: true) + ) KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )