Skip to content

Commit

Permalink
Merge branch 'bouncy-basenji' of github.com:nf-core/taxprofiler into …
Browse files Browse the repository at this point in the history
…bouncy-basenji
  • Loading branch information
jfy133 committed Jul 18, 2024
2 parents fac95e3 + 96dc9ec commit 656613e
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 27 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ jobs:
if [[ "${{ matrix.tags }}" == "test_motus" ]]; then
wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
python downloadDB.py --no-download-progress
echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv'
echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv'
echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv'
echo "motus,db2_mOTU,,long,db_mOTU" >> 'database_motus.csv'
echo "motus,db3_mOTU,,short;long,db_mOTU" >> 'database_motus.csv'
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }};
else
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }};
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133)
- [#466](https://github.com/nf-core/taxprofiler/pull/466) - Input database sheets now require a `db_type` column to distinguish between short- and long-read databases (added by @LilyAnderssonLee)
- [#505](https://github.com/nf-core/taxprofiler/pull/505) - Add small files to the file `tower.yml` (added by @LilyAnderssonLee)

### `Fixed`

Expand Down
6 changes: 6 additions & 0 deletions assets/schema_database.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
"errorMessage": "Invalid database db_params entry. No quotes allowed.",
"meta": ["db_params"]
},
"db_type": {
"type": "string",
"enum": ["short", "long", "short;long"],
"default": "short;long",
"meta": ["db_type"]
},
"db_path": {
"type": "string",
"exists": true,
Expand Down
2 changes: 1 addition & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ The resulting `.fastq` files may _not_ always be the 'final' reads that go into
<summary>Output files</summary>

- `filtlong/`
- `<sample_id>_filtered.fastq.gz`: Quality or short read data filtered file
- `<sample_id>_filtered.fastq.gz`: Quality or long read data filtered file
- `<sample_id>_filtered.log`: log file containing summary statistics

</details>
Expand Down
59 changes: 40 additions & 19 deletions subworkflows/local/profiling.nf
Original file line number Diff line number Diff line change
Expand Up @@ -60,26 +60,47 @@ workflow PROFILING {
COMBINE READS WITH POSSIBLE DATABASES
*/

// e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':true], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
// Separate default 'short;long' (when necessary) databases when short/long specified in database sheet
ch_dbs = databases
.map{
meta_db, db ->
[ [meta_db.db_type.split(";")].flatten(), meta_db, db]
}
.transpose(by: 0)
.map{
type, meta_db, db ->
[[type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db]
}

// Join short and long reads with their corresponding short/long database
// Note that databases left with the default `short;long` type will match both short and long reads.
// E.g. if there are no 'long' reads, the 'long' database channel element generated above
// will have nothing to join to and will be discarded
// Final output: [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]

ch_input_for_profiling = reads
.map {
meta, reads ->
[meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads]
}
.combine(databases)
.branch {
centrifuge: it[2]['tool'] == 'centrifuge'
diamond: it[2]['tool'] == 'diamond'
kaiju: it[2]['tool'] == 'kaiju'
kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: it[2]['tool'] == 'krakenuniq'
malt: it[2]['tool'] == 'malt'
metaphlan: it[2]['tool'] == 'metaphlan'
motus: it[2]['tool'] == 'motus'
kmcp: it[2]['tool'] == 'kmcp'
ganon: it[2]['tool'] == 'ganon'
unknown: true
}
.map{
meta, reads ->
[[type: meta.type], meta, reads]
}
.combine(ch_dbs, by: 0)
.map{
db_type, meta, reads, db_meta, db ->
[ meta, reads, db_meta, db ]
}
.branch { meta, reads, db_meta, db ->
centrifuge: db_meta.tool == 'centrifuge'
diamond: db_meta.tool == 'diamond'
kaiju: db_meta.tool == 'kaiju'
kraken2: db_meta.tool == 'kraken2' || db_meta.tool == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: db_meta.tool == 'krakenuniq'
malt: db_meta.tool == 'malt'
metaphlan: db_meta.tool == 'metaphlan'
motus: db_meta.tool == 'motus'
kmcp: db_meta.tool == 'kmcp'
ganon: db_meta.tool == 'ganon'
unknown: true
}

/*
PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
Expand Down
58 changes: 56 additions & 2 deletions tower.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,59 @@
reports:
multiqc_report.html:
display: "MultiQC HTML report"
samplesheet.csv:
display: "Auto-created samplesheet with collated metadata and FASTQ paths"
"**/fastqc/raw/*.html":
display: "A FastQC report containing quality metrics of raw reads in HTML format."
"**/fastqc/raw/*.txt":
display: "A FastQC report containing quality metrics of raw reads in TXT format."
"**/fastqc/preprocessed/*.html":
display: "A FastQC report containing quality metrics of processed reads in HTML format."
"**/fastqc/preprocessed/*.txt":
display: "A FastQC report containing quality metrics of processed reads in TXT format."
"**/falco/raw/*.html":
display: "A Falco report containing quality metrics of raw reads in HTML format."
"**/falco/raw/*.txt":
display: "A Falco report containing quality metrics of raw reads in TXT format."
"**/falco/preprocessed/*.html":
display: "A Falco report containing quality metrics of processed reads in HTML format."
"**/falco/preprocessed/*.txt":
display: "A Falco report containing quality metrics of processed reads in TXT format."
"**/fastp/*.html":
display: "A Log file in HTML format."
"**/bracken/*_combined_reports.txt":
display: "Combined bracken results as output from Bracken's combine_bracken_outputs.py script."
"**/bracken/*/*.tsv":
display: "A TSV file containing per-sample summary of Bracken results with abundance information."
"**/bracken/*/*report_bracken_species.txt":
display: "A Kraken2 style report with Bracken abundance information."
"**/kraken2/kraken2_*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by krakentools)."
"**/kraken2/*/*.kraken2.report.txt":
display: "A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if --save_minimizers specified."
"**/krakenuniq/*.krakenuniq.report.txt":
display: "A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-positive hits."
"**/krakenuniq/*.krakenuniq.classified.txt":
display: "An optional list of read IDs and the hits each read had against each database for a given sample."
"**/centrifuge/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by centrifuge-kreport)."
"**/centrifuge/*/*.centrifuge.report.txt":
display: "A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads."
"**/centrifuge/*/*.centrifuge.txt":
display: "A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample."
"**/ganon/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by ganon table)."
"**/kaiju/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by kaiju2table)"
"**/kaiju/*/*.kaijutable.txt":
display: "Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by kaiju2table)"
"**/krona/*.html":
display: "Per-tool/per-database interactive HTML file containing hierarchical piecharts."
"**/metaphlan/*/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by metaphlan_merge_tables)."
"**/metaphlan/*/*.bowtie2out.txt":
display: "Bowtie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters)."
"**/metaphlan/*/*_profile.txt":
display: "A MetaPhlAn taxonomic profile including abundance estimates."
"**/motus/*/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by motus_merge)."
"**/taxpasta/*tsv":
display: "Standardised taxon table containing multiple samples. The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample."
9 changes: 6 additions & 3 deletions workflows/taxprofiler.nf
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ workflow TAXPROFILER {
}
.branch { meta, run_accession, instrument_platform, fastq_1, fastq_2, fasta ->
fastq: meta.single_end || fastq_2
return [ meta, fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
return [ meta + [ type: "short" ], fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
nanopore: instrument_platform == 'OXFORD_NANOPORE'
meta.single_end = true
return [ meta, [ fastq_1 ] ]
return [ meta + [ type: "long" ], [ fastq_1 ] ]
fasta: meta.is_fasta
meta.single_end = true
return [ meta, [ fasta ] ]
return [ meta + [ type: "short" ], [ fasta ] ]
}

// Merge ch_input.fastq and ch_input.nanopore into a single channel
Expand All @@ -150,6 +150,9 @@ workflow TAXPROFILER {
// Validate and decompress databases
ch_dbs_for_untar = databases
.branch { db_meta, db_path ->
if ( !db_meta.db_type ) {
db_meta = db_meta + [ db_type: "short;long" ]
}
untar: db_path.name.endsWith( ".tar.gz" )
skip: true
}
Expand Down

0 comments on commit 656613e

Please sign in to comment.