Merge pull request #464 from nf-core/fix-combine
Fix combine
Midnighter authored Apr 7, 2024
2 parents 593c013 + 399b87c commit a9afbc0
Showing 4 changed files with 55 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
      run: |
        if [[ "${{ matrix.tags }}" == "test_motus" ]]; then
          wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
-         python downloadDB.py > download_db_log.txt
+         python downloadDB.py --no-download-progress
          echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
          echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv'
          nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }};
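
For reference, the two echo lines above produce the following two-line databases sheet, pointing the mOTUs test run at the freshly downloaded db_mOTU (reconstructed from the commands, not a file shipped in the repository):

    tool,db_name,db_params,db_path
    motus,db_mOTU,,db_mOTU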
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Fixed`

- [#336](https://github.com/nf-core/taxprofiler/issues/336) Replace samplesheet check with nf-validation for both sample and database input sheets (fix by @LilyAnderssonLee)
+- [#460](https://github.com/nf-core/taxprofiler/issues/460) corrected the channel transformations to combine Kaiju and mOTUs reports with their reference databases (fix by @Midnighter)

### `Dependencies`

2 changes: 1 addition & 1 deletion conf/modules.config
@@ -502,7 +502,7 @@ process {
}

    withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN {
-        ext.prefix = { "kraken2_${meta.db_name}_combined_reports" }
+        ext.prefix = { "kraken2_${meta.id}_combined_reports" }
        publishDir = [
            path: { "${params.outdir}/kraken2/" },
            mode: params.publish_dir_mode,
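
This prefix tracks the helper introduced in the subworkflow below: after grouping, the database name travels under the meta key `id` instead of `db_name`, so the resolved file name is unchanged. A minimal Groovy sketch with a hypothetical database name:

    // Hypothetical meta map, shaped like the output of the new groupProfiles() helper.
    def meta = [id: 'db1']
    assert "kraken2_${meta.id}_combined_reports" == 'kraken2_db1_combined_reports'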
115 changes: 52 additions & 63 deletions subworkflows/local/standardisation_profiles.nf
@@ -15,25 +15,43 @@ include { GANON_TABLE
// Custom Functions

/**
- * Combine profiles with their original database, then separate into two channels.
+ * Group all profiles per reference database.
 *
- * The channel elements are assumed to be tuples one of [ meta, profile ], and the
- * database to be of [db_key, meta, database_file].
+ * @param ch_profiles A channel containing pairs of a meta map and the report of
+ *     a given profiler, where meta must contain a key `db_name`.
+ * @return A channel with one element per reference database. Each element is a
+ *     pair of a meta map with an `id` key and all corresponding profiles.
+ */
+def groupProfiles(ch_profiles, groupTupleOptions = [:]) {
+    return ch_profiles
+        .map { meta, profile -> [meta.db_name, profile] }
+        .groupTuple(groupTupleOptions)
+        .map { db_name, profiles -> [[id: db_name], profiles] }
+}
+
+/**
+ * Combine profiles with their corresponding reference database, then separate into two channels.
 *
- * @param ch_profile A channel containing a meta and the profilign report of a given profiler
- * @param ch_database A channel containing a key, the database meta, and the database file/folders itself
- * @return A multiMap'ed output channel with two sub channels, one with the profile and the other with the db
+ * The combined results are returned on multiple channels, where the element
+ * position for the profiles in one channel is the same as the position of the
+ * corresponding database element in the other channel.
+ *
+ * @param ch_profiles A channel containing pairs of a meta map with an `id` key
+ *     for a reference database, and all the corresponding profiling reports.
+ * @param ch_database A channel containing pairs of a database meta map and the
+ *     database itself.
+ * @return A multiMap'ed output channel with two sub channels, one with the
+ *     profiles (`profile`) and the other with the corresponding database (`db`).
 */
-def combineProfilesWithDatabase(ch_profile, ch_database) {
-
-    return ch_profile
-        .map { meta, profile -> [meta.db_name, meta, profile] }
-        .combine(ch_database, by: 0)
-        .multiMap {
-            key, meta, profile, db_meta, db ->
-                profile: [meta, profile]
-                db: db
-        }
+def combineProfilesWithDatabase(ch_profiles, ch_database) {
+    return ch_profiles
+        .map { meta, profile -> [meta.id, meta, profile] }
+        .combine(ch_database.map { db_meta, db -> [db_meta.db_name, db] }, by: 0)
+        .multiMap {
+            key, meta, profile, db ->
+                profile: [meta, profile]
+                db: db
+        }
}

workflow STANDARDISATION_PROFILES {
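
To make the refactor easier to follow, here is a minimal stand-alone sketch of how the two helpers compose; the channel contents and file names are hypothetical, not part of the commit:

    // Two reports profiled against the same hypothetical database 'db1'.
    ch_profiles = Channel.of(
        [[id: 'sample1', db_name: 'db1'], file('sample1.report.txt')],
        [[id: 'sample2', db_name: 'db1'], file('sample2.report.txt')]
    )
    ch_databases = Channel.of(
        [[db_name: 'db1'], file('db1', type: 'dir')]
    )

    // Emits one element: [[id: 'db1'], [sample1.report.txt, sample2.report.txt]]
    ch_grouped = groupProfiles(ch_profiles)

    // The profile and db sub-channels stay position-aligned, so a downstream
    // module can receive them as separate inputs for the same task.
    ch_combined = combineProfilesWithDatabase(ch_grouped, ch_databases)
    ch_combined.profile.view() // [[id: 'db1'], [sample1.report.txt, sample2.report.txt]]
    ch_combined.db.view()      // the matching db1 directory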
@@ -117,12 +135,7 @@ workflow STANDARDISATION_PROFILES {

// Bracken

-    ch_profiles_for_bracken = ch_input_profiles.bracken
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple()
-        .map {
-            [[id:it[0]], it[1]]
-        }
+    ch_profiles_for_bracken = groupProfiles(ch_input_profiles.bracken)

BRACKEN_COMBINEBRACKENOUTPUTS ( ch_profiles_for_bracken )

@@ -131,13 +144,10 @@
// Collect and replace id for db_name for prefix
// Have to sort by size to ensure first file actually has hits otherwise
// the script fails
-    ch_profiles_for_centrifuge = ch_input_profiles.centrifuge
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple(sort: {-it.size()} )
-        .map {
-            [[id:it[0]], it[1]]
-        }
-
+    ch_profiles_for_centrifuge = groupProfiles(
+        ch_input_profiles.centrifuge,
+        [sort: { -it.size() }]
+    )

KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
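
The options map is forwarded verbatim to groupTuple, so the size-sort that the comment above describes now happens inside the helper. A hypothetical stand-alone illustration:

    ch_reports = Channel.of(
        [[db_name: 'db1'], file('small.report.txt')], // hypothetical, few hits
        [[db_name: 'db1'], file('large.report.txt')]  // hypothetical, many hits
    )
    // The closure ranks files within each group by descending size, so the
    // first file in the emitted list is the largest one.
    groupProfiles(ch_reports, [sort: { -it.size() }]).view()
    // [[id: 'db1'], [large.report.txt, small.report.txt]]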
@@ -146,12 +156,7 @@
// Kaiju

// Collect and replace id for db_name for prefix
-    ch_profiles_for_kaiju = ch_input_classifications.kaiju
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple()
-        .map {
-            [[id:it[0]], it[1]]
-        }
+    ch_profiles_for_kaiju = groupProfiles(ch_input_classifications.kaiju)

ch_input_for_kaiju2tablecombine = combineProfilesWithDatabase(ch_profiles_for_kaiju, ch_input_databases.kaiju)
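
The two sub-channels are then consumed positionally, pairing each group of reports with its database; a sketch of the consumption pattern, with a stand-in module name since the real call is collapsed above:

    // Hypothetical module call shape, not the actual code hidden behind the fold.
    SOME_COMBINE_MODULE (
        ch_input_for_kaiju2tablecombine.profile, // [meta, reports] per database
        ch_input_for_kaiju2tablecombine.db       // matching database, same order
    )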

@@ -164,29 +169,23 @@
// Collect and replace id for db_name for prefix
// Have to sort by size to ensure first file actually has hits otherwise
// the script fails
-    ch_profiles_for_kraken2 = ch_input_profiles.kraken2
-        .map {
-            meta, profiles ->
-                def new_meta = [:]
-                new_meta.tool = meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool // replace to get the right output-format description
-                new_meta.id = meta.tool // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
-                new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
-                [ new_meta, profiles ]
-        }
-        .groupTuple(sort: {-it.size()})
+    ch_profiles_for_kraken2 = groupProfiles(
+        ch_input_profiles.kraken2
+            .map { meta, profile ->
+                // Replace database name, to get the right output description.
+                def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
+                return [meta + [db_name: db_name], profile]
+            },
+        [sort: { -it.size() }]
+    )

KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )
ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.versions )
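
The renaming means a Kraken2 report generated for Bracken is grouped under a distinct database name, so it cannot collide with a plain Kraken2 run against the same database. A small sketch with hypothetical values:

    def meta = [id: 'sample1', tool: 'kraken2-bracken', db_name: 'db1']
    def db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}"
    // groupProfiles() then files this report under [id: 'db1-bracken'],
    // separate from plain Kraken2 reports grouped under [id: 'db1'].
    assert (meta + [db_name: db_name]).db_name == 'db1-bracken'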

// MetaPhlAn

-    ch_profiles_for_metaphlan = ch_input_profiles.metaphlan
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple()
-        .map {
-            [[id:it[0]], it[1]]
-        }
+    ch_profiles_for_metaphlan = groupProfiles(ch_input_profiles.metaphlan)

METAPHLAN_MERGEMETAPHLANTABLES ( ch_profiles_for_metaphlan )
ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt )
@@ -198,12 +197,7 @@
// Therefore removing db info here, and publish merged at root mOTUs results
// directory

-    ch_profiles_for_motus = ch_input_profiles.motus
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple()
-        .map {
-            [[id:it[0]], it[1]]
-        }
+    ch_profiles_for_motus = groupProfiles(ch_input_profiles.motus)

ch_input_for_motusmerge = combineProfilesWithDatabase(ch_profiles_for_motus, ch_input_databases.motus)

@@ -212,12 +206,7 @@

// Ganon

-    ch_profiles_for_ganon = ch_input_profiles.ganon
-        .map { [it[0]['db_name'], it[1]] }
-        .groupTuple()
-        .map {
-            [[id:it[0]], it[1]]
-        }
+    ch_profiles_for_ganon = groupProfiles(ch_input_profiles.ganon)

GANON_TABLE ( ch_profiles_for_ganon )
ch_multiqc_files = ch_multiqc_files.mix( GANON_TABLE.out.txt )
