From c6134cc777385aa4c9a0f21303a7e6fbde782deb Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Wed, 27 Mar 2024 16:04:31 +0100
Subject: [PATCH 1/6] chore: avoid mOTUs download progress report

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 44bb4c5f..e6d5d4df 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
         run: |
           if [[ "${{ matrix.tags }}" == "test_motus" ]]; then
             wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
-            python downloadDB.py > download_db_log.txt
+            python downloadDB.py --no-download-progress
             echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
             echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv'
             nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }};

From f7caedcce62134f3e99acc9993ed5480a965fe86 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Wed, 27 Mar 2024 17:08:36 +0100
Subject: [PATCH 2/6] fix: transform channels to combine on DB name

Change wording of the function and parameter descriptions.
---
 .../local/standardisation_profiles.nf         | 35 ++++++++++---------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index 4592e9de..1f121b98 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -15,25 +15,28 @@ include { GANON_TABLE
 // Custom Functions
 
 /**
-* Combine profiles with their original database, then separate into two channels.
+* Combine profiles with their corresponding reference database, then separate into two channels.
 *
-* The channel elements are assumed to be tuples one of [ meta, profile ], and the
-* database to be of [db_key, meta, database_file].
+* The combined results are returned on two channels, where the element
+* position for the profiles in one channel is the same as the position of the
+* corresponding database element in the other channel.
 *
-* @param ch_profile A channel containing a meta and the profilign report of a given profiler
-* @param ch_database A channel containing a key, the database meta, and the database file/folders itself
-* @return A multiMap'ed output channel with two sub channels, one with the profile and the other with the db
+* @param ch_profiles A channel containing pairs of a meta map whose `id` is the
+*   reference database name, and all the corresponding profiling reports.
+* @param ch_database A channel containing pairs of a database meta map with a
+*   `db_name` key (matched against the profile `id`) and the database itself.
+* @return A multiMap'ed output channel with two sub channels, one with the
+*   profiles (`profile`) and the other with the corresponding database (`db`).
 */
-def combineProfilesWithDatabase(ch_profile, ch_database) {
-
-return ch_profile
-    .map { meta, profile -> [meta.db_name, meta, profile] }
-    .combine(ch_database, by: 0)
-    .multiMap {
-        key, meta, profile, db_meta, db ->
-            profile: [meta, profile]
-            db: db
-    }
+def combineProfilesWithDatabase(ch_profiles, ch_database) {
+    return ch_profiles
+        .map { meta, profile -> [meta.id, meta, profile] }
+        .combine(ch_database.map { db_meta, db -> [db_meta.db_name, db] }, by: 0)
+        .multiMap {
+            key, meta, profile, db ->
+                profile: [meta, profile]
+                db: db
+        }
 }
 
 workflow STANDARDISATION_PROFILES {

From 3d305475f31d72048a55eb2df80d38070ba7c932 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Wed, 27 Mar 2024 17:40:06 +0100
Subject: [PATCH 3/6] refactor: extract function to group profiles

---
 .../local/standardisation_profiles.nf         | 88 +++++++++----------
 1 file changed, 41 insertions(+), 47 deletions(-)

diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index 1f121b98..c005ecd3 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -14,6 +14,21 @@ include { GANON_TABLE
 
 // Custom Functions
 
+/**
+* Group all profiles per reference database.
+*
+* @param ch_profiles A channel of [meta, profile] pairs; meta must contain `db_name`.
+* @param groupTupleOptions Optional map of named options passed on to `groupTuple`.
+* @return A channel with one element per reference database. Each element is a
+*   pair of a meta map `[id: db_name]` and all corresponding profiles.
+*/
+def groupProfiles(ch_profiles, groupTupleOptions = [:]) {
+    return ch_profiles
+        .map { meta, profile -> [meta.db_name, profile] }
+        .groupTuple(groupTupleOptions)
+        .map { db_name, profiles -> [[id: db_name], profiles] }
+}
+
 /**
 * Combine profiles with their corresponding reference database, then separate into two channels.
 *
@@ -120,12 +135,7 @@ workflow STANDARDISATION_PROFILES {
 
     // Bracken
 
-    ch_profiles_for_bracken = ch_input_profiles.bracken
-                            .map { [it[0]['db_name'], it[1]] }
-                            .groupTuple()
-                            .map {
-                                [[id:it[0]], it[1]]
-                            }
+    ch_profiles_for_bracken = groupProfiles(ch_input_profiles.bracken)
 
     BRACKEN_COMBINEBRACKENOUTPUTS ( ch_profiles_for_bracken )
 
@@ -134,13 +144,10 @@ workflow STANDARDISATION_PROFILES {
     // Collect and replace id for db_name for prefix
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
-    ch_profiles_for_centrifuge = ch_input_profiles.centrifuge
-                                .map { [it[0]['db_name'], it[1]] }
-                                .groupTuple(sort: {-it.size()} )
-                                .map {
-                                    [[id:it[0]], it[1]]
-                                }
-
+    ch_profiles_for_centrifuge = groupProfiles(
+        ch_input_profiles.centrifuge,
+        [sort: { -it.size() }]
+    )
 
     KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
@@ -149,12 +156,7 @@ workflow STANDARDISATION_PROFILES {
     // Kaiju
 
     // Collect and replace id for db_name for prefix
-    ch_profiles_for_kaiju = ch_input_classifications.kaiju
-                                .map { [it[0]['db_name'], it[1]] }
-                                .groupTuple()
-                                .map {
-                                    [[id:it[0]], it[1]]
-                                }
+    ch_profiles_for_kaiju = groupProfiles(ch_input_classifications.kaiju)
 
     ch_input_for_kaiju2tablecombine = combineProfilesWithDatabase(ch_profiles_for_kaiju, ch_input_databases.kaiju)
 
@@ -167,16 +169,23 @@ workflow STANDARDISATION_PROFILES {
     // Collect and replace id for db_name for prefix
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
-    ch_profiles_for_kraken2 = ch_input_profiles.kraken2
-                                .map {
-                                    meta, profiles ->
-                                        def new_meta = [:]
-                                        new_meta.tool = meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool // replace to get the right output-format description
-                                        new_meta.id = meta.tool // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
-                                        new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
-                                    [ new_meta, profiles ]
-                                }
-                                .groupTuple(sort: {-it.size()})
+    ch_profiles_for_kraken2 = groupProfiles(
+        ch_input_profiles.kraken2.map { meta, profile ->
+            def new_meta = [
+                // Replace the tool name to get the right output-format description.
+                tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool,
+                // Append so as to disambiguate when we have same databases
+                // for kraken2 step of bracken, with normal bracken.
+                id: meta.tool,
+                // Append so as to disambiguate when we have same databases
+                // for kraken2 step of bracken, with normal bracken.
+                db_name: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
+
+            ]
+            return [meta + new_meta, profile]
+        },
+        [sort: { -it.size() }]
+    )
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )
@@ -184,12 +193,7 @@ workflow STANDARDISATION_PROFILES {
 
     // MetaPhlAn
 
-    ch_profiles_for_metaphlan = ch_input_profiles.metaphlan
-                            .map { [it[0]['db_name'], it[1]] }
-                            .groupTuple()
-                            .map {
-                                [[id:it[0]], it[1]]
-                            }
+    ch_profiles_for_metaphlan = groupProfiles(ch_input_profiles.metaphlan)
 
     METAPHLAN_MERGEMETAPHLANTABLES ( ch_profiles_for_metaphlan )
     ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt )
@@ -201,12 +205,7 @@ workflow STANDARDISATION_PROFILES {
     // Therefore removing db info here, and publish merged at root mOTUs results
     // directory
 
-    ch_profiles_for_motus = ch_input_profiles.motus
-                                .map { [it[0]['db_name'], it[1]] }
-                                .groupTuple()
-                                .map {
-                                    [[id:it[0]], it[1]]
-                                }
+    ch_profiles_for_motus = groupProfiles(ch_input_profiles.motus)
 
     ch_input_for_motusmerge = combineProfilesWithDatabase(ch_profiles_for_motus, ch_input_databases.motus)
 
@@ -215,12 +214,7 @@ workflow STANDARDISATION_PROFILES {
 
     // Ganon
 
-    ch_profiles_for_ganon = ch_input_profiles.ganon
-                            .map { [it[0]['db_name'], it[1]] }
-                            .groupTuple()
-                            .map {
-                                [[id:it[0]], it[1]]
-                            }
+    ch_profiles_for_ganon = groupProfiles(ch_input_profiles.ganon)
 
     GANON_TABLE ( ch_profiles_for_ganon )
     ch_multiqc_files = ch_multiqc_files.mix( GANON_TABLE.out.txt )

From 2c430aa7aaa6c6bc9494af56a8d35dbaf7b94019 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Wed, 27 Mar 2024 18:01:44 +0100
Subject: [PATCH 4/6] docs: add changelog entry

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index df1449e1..bbebe94c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#336](https://github.com/nf-core/taxprofiler/issues/336) Replace samplesheet check with nf-validation for both sample and database input sheets (fix by @LilyAnderssonLee)
+- [#460](https://github.com/nf-core/taxprofiler/issues/460) Correct the channel transformations to combine Kaiju and mOTUs reports with their reference databases (fix by @Midnighter)
 
 ### `Dependencies`
 

From 46e6637b11c0b4f5bf8d192f932d9073563027cb Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sat, 6 Apr 2024 15:10:40 +0200
Subject: [PATCH 5/6] fix: correct file name for combined kraken2 reports

---
 conf/modules.config                           |  2 +-
 .../local/standardisation_profiles.nf         | 20 ++++++-------------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index f5a5e631..1956605e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -502,7 +502,7 @@ process {
     }
 
     withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN {
-        ext.prefix = { "kraken2_${meta.db_name}_combined_reports" }
+        ext.prefix = { "kraken2_${meta.id}_combined_reports" }
         publishDir = [
             path: { "${params.outdir}/kraken2/" },
             mode: params.publish_dir_mode,
diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index c005ecd3..3b664204 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -170,22 +170,14 @@ workflow STANDARDISATION_PROFILES {
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
     ch_profiles_for_kraken2 = groupProfiles(
-        ch_input_profiles.kraken2.map { meta, profile ->
-            def new_meta = [
-                // Replace the tool name to get the right output-format description.
-                tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool,
-                // Append so as to disambiguate when we have same databases
-                // for kraken2 step of bracken, with normal bracken.
-                id: meta.tool,
-                // Append so as to disambiguate when we have same databases
-                // for kraken2 step of bracken, with normal bracken.
-                db_name: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
-
-            ]
-            return [meta + new_meta, profile]
+        ch_input_profiles.kraken2.dump(tag: 'k2-profiles', pretty: true)
+        .map { meta, profile ->
+            // Replace database name, to get the right output description.
+            def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
+            return [meta + [db_name: db_name], profile]
         },
         [sort: { -it.size() }]
-    )
+    ).dump(tag: 'k2-grouped', pretty: true)
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )

From 399b87c072f4e2784bc3491ad3ef34d9e93f6b45 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sun, 7 Apr 2024 02:51:21 +0200
Subject: [PATCH 6/6] chore: remove channel dumps used for debugging

Co-authored-by: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com>
---
 subworkflows/local/standardisation_profiles.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index 3b664204..95cd9d3f 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -170,14 +170,14 @@ workflow STANDARDISATION_PROFILES {
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
     ch_profiles_for_kraken2 = groupProfiles(
-        ch_input_profiles.kraken2.dump(tag: 'k2-profiles', pretty: true)
+        ch_input_profiles.kraken2
         .map { meta, profile ->
             // Replace database name, to get the right output description.
             def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
             return [meta + [db_name: db_name], profile]
         },
         [sort: { -it.size() }]
-    ).dump(tag: 'k2-grouped', pretty: true)
+    )
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )