Skip to content

Commit

Permalink
Merge branch 'bouncy-basenji' of github.com:nf-core/taxprofiler into …
Browse files Browse the repository at this point in the history
…bouncy-basenji
  • Loading branch information
jfy133 committed Jul 18, 2024
2 parents fac95e3 + 96dc9ec commit 656613e
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 27 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ jobs:
if [[ "${{ matrix.tags }}" == "test_motus" ]]; then
wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
python downloadDB.py --no-download-progress
echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv'
echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv'
echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv'
echo "motus,db2_mOTU,,long,db_mOTU" >> 'database_motus.csv'
echo "motus,db3_mOTU,,short;long,db_mOTU" >> 'database_motus.csv'
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }};
else
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }};
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133)
- [#466](https://github.com/nf-core/taxprofiler/pull/466) - Input database sheets now require a `db_type` column to distinguish between short- and long-read databases (added by @LilyAnderssonLee)
- [#505](https://github.com/nf-core/taxprofiler/pull/505) - Add small files to the file `tower.yml` (added by @LilyAnderssonLee)

### `Fixed`

Expand Down
6 changes: 6 additions & 0 deletions assets/schema_database.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
"errorMessage": "Invalid database db_params entry. No quotes allowed.",
"meta": ["db_params"]
},
"db_type": {
"type": "string",
"enum": ["short", "long", "short;long"],
"default": "short;long",
"meta": ["db_type"]
},
"db_path": {
"type": "string",
"exists": true,
Expand Down
2 changes: 1 addition & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ The resulting `.fastq` files may _not_ always be the 'final' reads that go into
<summary>Output files</summary>

- `filtlong/`
- `<sample_id>_filtered.fastq.gz`: Quality or short read data filtered file
- `<sample_id>_filtered.fastq.gz`: Quality or long read data filtered file
- `<sample_id>_filtered.log`: log file containing summary statistics

</details>
Expand Down
59 changes: 40 additions & 19 deletions subworkflows/local/profiling.nf
Original file line number Diff line number Diff line change
Expand Up @@ -60,26 +60,47 @@ workflow PROFILING {
COMBINE READS WITH POSSIBLE DATABASES
*/

// e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':true], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
// Separate default 'short;long' (when necessary) databases when short/long specified in database sheet
ch_dbs = databases
.map{
meta_db, db ->
[ [meta_db.db_type.split(";")].flatten(), meta_db, db]
}
.transpose(by: 0)
.map{
type, meta_db, db ->
[[type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db]
}

// Join short and long reads with their corresponding short/long database
// Note that databases left with the default `short;long` type will match both short and long reads.
// E.g. if there are no 'long' reads, the 'long' database channel element generated above
// will have nothing to join to and will be discarded
// Final output: [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]

ch_input_for_profiling = reads
.map {
meta, reads ->
[meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads]
}
.combine(databases)
.branch {
centrifuge: it[2]['tool'] == 'centrifuge'
diamond: it[2]['tool'] == 'diamond'
kaiju: it[2]['tool'] == 'kaiju'
kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: it[2]['tool'] == 'krakenuniq'
malt: it[2]['tool'] == 'malt'
metaphlan: it[2]['tool'] == 'metaphlan'
motus: it[2]['tool'] == 'motus'
kmcp: it[2]['tool'] == 'kmcp'
ganon: it[2]['tool'] == 'ganon'
unknown: true
}
.map{
meta, reads ->
[[type: meta.type], meta, reads]
}
.combine(ch_dbs, by: 0)
.map{
db_type, meta, reads, db_meta, db ->
[ meta, reads, db_meta, db ]
}
.branch { meta, reads, db_meta, db ->
centrifuge: db_meta.tool == 'centrifuge'
diamond: db_meta.tool == 'diamond'
kaiju: db_meta.tool == 'kaiju'
kraken2: db_meta.tool == 'kraken2' || db_meta.tool == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: db_meta.tool == 'krakenuniq'
malt: db_meta.tool == 'malt'
metaphlan: db_meta.tool == 'metaphlan'
motus: db_meta.tool == 'motus'
kmcp: db_meta.tool == 'kmcp'
ganon: db_meta.tool == 'ganon'
unknown: true
}

/*
PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
Expand Down
58 changes: 56 additions & 2 deletions tower.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,59 @@
reports:
multiqc_report.html:
display: "MultiQC HTML report"
samplesheet.csv:
display: "Auto-created samplesheet with collated metadata and FASTQ paths"
"**/fastqc/raw/*.html":
display: "A FastQC report containing quality metrics of raw reads in HTML format."
"**/fastqc/raw/*.txt":
display: "A FastQC report containing quality metrics of raw reads in TXT format."
"**/fastqc/preprocessed/*.html":
display: "A FastQC report containing quality metrics of processed reads in HTML format."
"**/fastqc/preprocessed/*.txt":
display: "A FastQC report containing quality metrics of processed reads in TXT format."
"**/falco/raw/*.html":
display: "A Falco report containing quality metrics of raw reads in HTML format."
"**/falco/raw/*.txt":
display: "A Falco report containing quality metrics of raw reads in TXT format."
"**/falco/preprocessed/*.html":
display: "A Falco report containing quality metrics of processed reads in HTML format."
"**/falco/preprocessed/*.txt":
display: "A Falco report containing quality metrics of processed reads in TXT format."
"**/fastp/*.html":
display: "A Log file in HTML format."
"**/bracken/*_combined_reports.txt":
display: "Combined bracken results as output from Bracken's combine_bracken_outputs.py script."
"**/bracken/*/*.tsv":
display: "A TSV file containing per-sample summary of Bracken results with abundance information."
"**/bracken/*/*report_bracken_species.txt":
display: "A Kraken2 style report with Bracken abundance information."
"**/kraken2/kraken2_*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by krakentools)."
"**/kraken2/*/*.kraken2.report.txt":
display: "A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if --save_minimizers specified."
"**/krakenuniq/*.krakenuniq.report.txt":
display: "A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-positive hits."
"**/krakenuniq/*.krakenuniq.classified.txt":
display: "An optional list of read IDs and the hits each read had against each database for a given sample."
"**/centrifuge/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by centrifuge-kreport)."
"**/centrifuge/*/*.centrifuge.report.txt":
display: "A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads."
"**/centrifuge/*/*.centrifuge.txt":
display: "A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample."
"**/ganon/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by ganon table)."
"**/kaiju/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by kaiju2table)"
"**/kaiju/*/*.kaijutable.txt":
display: "Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by kaiju2table)"
"**/krona/*.html":
display: "Per-tool/per-database interactive HTML file containing hierarchical piecharts."
"**/metaphlan/*/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by metaphlan_merge_tables)."
"**/metaphlan/*/*.bowtie2out.txt":
display: "Bowtie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters)."
"**/metaphlan/*/*_profile.txt":
display: "A MetaPhlAn taxonomic profile including abundance estimates."
"**/motus/*/*_combined_reports.txt":
display: "A combined profile of all samples aligned to a given database (as generated by motus_merge)."
"**/taxpasta/*tsv":
display: "Standardised taxon table containing multiple samples. The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample."
9 changes: 6 additions & 3 deletions workflows/taxprofiler.nf
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ workflow TAXPROFILER {
}
.branch { meta, run_accession, instrument_platform, fastq_1, fastq_2, fasta ->
fastq: meta.single_end || fastq_2
return [ meta, fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
return [ meta + [ type: "short" ], fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
nanopore: instrument_platform == 'OXFORD_NANOPORE'
meta.single_end = true
return [ meta, [ fastq_1 ] ]
return [ meta + [ type: "long" ], [ fastq_1 ] ]
fasta: meta.is_fasta
meta.single_end = true
return [ meta, [ fasta ] ]
return [ meta + [ type: "short" ], [ fasta ] ]
}

// Merge ch_input.fastq and ch_input.nanopore into a single channel
Expand All @@ -150,6 +150,9 @@ workflow TAXPROFILER {
// Validate and decompress databases
ch_dbs_for_untar = databases
.branch { db_meta, db_path ->
if ( !db_meta.db_type ) {
db_meta = db_meta + [ db_type: "short;long" ]
}
untar: db_path.name.endsWith( ".tar.gz" )
skip: true
}
Expand Down

0 comments on commit 656613e

Please sign in to comment.