Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read_qc subworkflow #15

Merged
merged 13 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions modules/ebi-metagenomics/fastp/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// This fastp module is copied from the existing nf-core module (https://nf-co.re/modules/fastp, https://github.com/nf-core/modules/commit/d497a4868ace3302016ea8ed4b395072d5e833cd)
// because nf-core does not currently provide a way to install modules from more than one nf-core modules repository.
// The only change compared to the original is that the "adapter_fasta" input has been removed, as we are unlikely
// to need it for our purposes.

process FASTP {
    tag "$meta.id"
    label 'process_medium'

    conda "bioconda::fastp=0.23.4"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
        'biocontainers/fastp:0.23.4--h5f740d0_0' }"

    input:
    tuple val(meta), path(reads)
    val   save_trimmed_fail
    val   save_merged

    output:
    tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads
    tuple val(meta), path('*.json')           , emit: json
    tuple val(meta), path('*.html')           , emit: html
    tuple val(meta), path('*.log')            , emit: log
    path "versions.yml"                       , emit: versions
    tuple val(meta), path('*.fail.fastq.gz')  , optional:true, emit: reads_fail
    tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args   ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    // Options that capture reads failing the filters; left empty unless the caller asked for them.
    // Single-end (and interleaved) input yields one file, paired-end input one file per mate.
    def failed_reads_opts = ''
    if (save_trimmed_fail) {
        failed_reads_opts = meta.single_end ?
            "--failed_out ${prefix}.fail.fastq.gz" :
            "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz"
    }

    // Interleaved data is supplied as single-end input with '--interleaved_in' set through ext.args.
    def interleaved = args.contains('--interleaved_in')

    // In every branch the inputs are soft-linked to prefix-based names first so that
    // downstream reports (e.g. MultiQC) see consistent sample names.
    if (interleaved) {
        // fastp writes the trimmed interleaved reads to stdout, which is gzipped into the output file.
        """
        [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz

        fastp \\
            --stdout \\
            --in1 ${prefix}.fastq.gz \\
            --thread $task.cpus \\
            --json ${prefix}.fastp.json \\
            --html ${prefix}.fastp.html \\
            $failed_reads_opts \\
            $args \\
            2> ${prefix}.fastp.log \\
        | gzip -c > ${prefix}.fastp.fastq.gz

        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
        END_VERSIONS
        """
    } else if (meta.single_end) {
        """
        [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz

        fastp \\
            --in1 ${prefix}.fastq.gz \\
            --out1 ${prefix}.fastp.fastq.gz \\
            --thread $task.cpus \\
            --json ${prefix}.fastp.json \\
            --html ${prefix}.fastp.html \\
            $failed_reads_opts \\
            $args \\
            2> ${prefix}.fastp.log

        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
        END_VERSIONS
        """
    } else {
        // Merging of overlapping mates is only requested for paired-end input.
        def merge_opts = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
        """
        [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
        [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz
        fastp \\
            --in1 ${prefix}_1.fastq.gz \\
            --in2 ${prefix}_2.fastq.gz \\
            --out1 ${prefix}_1.fastp.fastq.gz \\
            --out2 ${prefix}_2.fastp.fastq.gz \\
            --json ${prefix}.fastp.json \\
            --html ${prefix}.fastp.html \\
            $failed_reads_opts \\
            $merge_opts \\
            --thread $task.cpus \\
            --detect_adapter_for_pe \\
            $args \\
            2> ${prefix}.fastp.log

        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
        END_VERSIONS
        """
    }
}
69 changes: 69 additions & 0 deletions modules/ebi-metagenomics/fastp/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Module metadata for the FASTP process; the input/output names and file patterns
# below mirror the declarations in this module's main.nf.
name: fastp
description: Perform adapter/quality trimming on sequencing reads
keywords:
  - trimming
  - quality control
  - fastq
tools:
  - fastp:
      description: |
        A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.
      documentation: https://github.com/OpenGene/fastp
      doi: 10.1093/bioinformatics/bty560
      licence: ["MIT"]
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively. If you wish to run interleaved paired-end data, supply as single-end data
        but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module.
  - save_trimmed_fail:
      type: boolean
      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
  - save_merged:
      type: boolean
      description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz` (paired-end input only)

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: The trimmed/modified/unmerged fastq reads
      pattern: "*.fastp.fastq.gz"
  - json:
      type: file
      description: Results in JSON format
      pattern: "*.json"
  - html:
      type: file
      description: Results in HTML format
      pattern: "*.html"
  - log:
      type: file
      description: fastp log file
      pattern: "*.log"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - reads_fail:
      type: file
      description: Reads that failed the preprocessing
      pattern: "*.fail.fastq.gz"
  - reads_merged:
      type: file
      description: Reads that were successfully merged
      pattern: "*.merged.fastq.gz"
authors:
  - "@drpatelh"
  - "@kevinmenden"
43 changes: 43 additions & 0 deletions modules/ebi-metagenomics/seqtk/seq/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// This seqtk/seq module is copied from the existing nf-core module (https://nf-co.re/modules/seqtk_seq/, https://github.com/nf-core/modules/commit/726ee59cd9360a965d96ea9ea8770f16b8ddd6cc)
// because nf-core does not currently provide a way to install modules from more than one nf-core modules repository.

process SEQTK_SEQ {
    tag "$meta.id"
    label 'process_single'

    // The conda pin is kept on the same seqtk release as the container images below,
    // so that every execution profile runs the same tool version.
    conda "bioconda::seqtk=1.3"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' :
        'biocontainers/seqtk:1.3--h5bf99c6_3' }"

    input:
    tuple val(meta), path(fastx)

    output:
    tuple val(meta), path("*.gz")     , emit: fastx
    path "versions.yml"               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args   ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    // The output is named *.fasta.gz when the input already carries a FASTA extension
    // (optionally gzipped) ...
    def fasta_input  = "$fastx" ==~ /.+\.(fasta|fa|fas|fna)(\.gz)?/
    // ... or when a standalone -a/-A option appears anywhere in ext.args, not only when
    // it is the sole argument (e.g. '-a -l 60' must also yield a .fasta.gz name).
    def fasta_forced = ("$args" =~ /(?<!\S)-[aA](?!\S)/).find()
    def extension    = (fasta_input || fasta_forced) ? "fasta" : "fastq"
    """
    seqtk \\
        seq \\
        $args \\
        $fastx | \\
        gzip -c > ${prefix}.seqtk-seq.${extension}.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
    END_VERSIONS
    """
}
44 changes: 44 additions & 0 deletions modules/ebi-metagenomics/seqtk/seq/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Module metadata for the SEQTK_SEQ process; the input/output names below match the
# `fastx` channel names declared in this module's main.nf.
name: seqtk_seq
description: Common transformation operations on FASTA or FASTQ files.
keywords:
  - seq
  - fasta
  - fastq
tools:
  - seqtk:
      description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. The seqtk seq command enables common transformation operations on FASTA or FASTQ files.
      homepage: https://github.com/lh3/seqtk
      documentation: https://docs.csc.fi/apps/seqtk/
      tool_dev_url: https://github.com/lh3/seqtk
      licence: ["MIT"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - fastx:
      type: file
      description: A FASTQ or FASTA file
      pattern: "*.{fastq,fastq.gz,fq,fq.gz,fasta,fasta.gz,fa,fa.gz,fas,fas.gz,fna,fna.gz}"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - fastx:
      type: file
      description: Gzip-compressed FASTQ/FASTA file written by `seqtk seq`
      pattern: "*.seqtk-seq.{fastq,fasta}.gz"

authors:
  - "@hseabolt"
  - "@mjcipriano"
  - "@sateeshperi"
33 changes: 33 additions & 0 deletions subworkflows/ebi-metagenomics/reads_qc/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@

include { FASTP } from '../../../modules/ebi-metagenomics/fastp/main'
include { SEQTK_SEQ } from '../../../modules/ebi-metagenomics/seqtk/seq/main'

workflow READS_QC {

    take:
    ch_reads    // channel: [ val(meta), [ fastq ] ]

    main:

    ch_versions = Channel.empty()

    // Trim and filter the reads; the two flags come from pipeline params and control
    // whether failed reads and merged paired-end reads are written out.
    FASTP ( ch_reads, params.save_trimmed_fail, params.save_merged )
    ch_versions = ch_versions.mix( FASTP.out.versions.first() )

    // Single-end fastp output followed by the merged paired-end reads:
    // these are the reads passed on for conversion by seqtk seq.
    ch_reads_se_and_merged = FASTP.out.reads
        .filter { item -> item[0].single_end }
        .concat( FASTP.out.reads_merged )

    SEQTK_SEQ ( ch_reads_se_and_merged )
    ch_versions = ch_versions.mix( SEQTK_SEQ.out.versions.first() )

    emit:
    reads               = FASTP.out.reads           // channel: [ val(meta), [ fastq ] ]
    reads_se_and_merged = ch_reads_se_and_merged    // channel: [ val(meta), [ fastq ] ]
    fastp_summary_json  = FASTP.out.json            // channel: [ val(meta), [ json ] ]
    reads_fasta         = SEQTK_SEQ.out.fastx       // channel: [ val(meta), [ fasta ] ]
    versions            = ch_versions               // channel: [ versions.yml ]
}

52 changes: 52 additions & 0 deletions subworkflows/ebi-metagenomics/reads_qc/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
# Subworkflow metadata for READS_QC.
# Besides the `reads` channel, the subworkflow reads `params.save_trimmed_fail` and
# `params.save_merged` and passes them to fastp.
name: "reads_qc"
description: |
  Quality control and merging of fastq-format short-reads using fastp, followed by
  conversion of the single-end and merged reads with seqtk seq, generating fasta
keywords:
  - trimming
  - quality control
  - merging
  - fastq
  - fasta
components:
  - fastp
  - seqtk/seq
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. `[ id:'test' ]`
  - reads:
      type: file
      description: The trimmed/modified/unmerged fastq reads
      pattern: "*.fastp.fastq.gz"
  - reads_se_and_merged:
      type: file
      description: fastp-cleaned single-end reads and merged paired-end reads
      pattern: "*.{fastp,merged}.fastq.gz"
  - fastp_summary_json:
      type: file
      description: fastp results in JSON format
      pattern: "*.json"
  - reads_fasta:
      type: file
      description: FASTA file converted from FASTQ
      pattern: "*.fasta.gz"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@chrisata"
12 changes: 12 additions & 0 deletions subworkflows/ebi-metagenomics/reads_qc/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Default settings for the READS_QC subworkflow.

// Flags handed to FASTP: keep the reads that failed filtering and the merged paired-end reads.
params.save_trimmed_fail = true
params.save_merged       = true

process {

    // Pass '-a' to seqtk seq for the SEQTK_SEQ step.
    withName: 'SEQTK_SEQ' {
        ext.args = '-a'
    }
}
12 changes: 12 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ eggnogmapper:
- modules/ebi-metagenomics/eggnogmapper/**
- tests/modules/ebi-metagenomics/eggnogmapper/**

fastp:
- modules/ebi-metagenomics/fastp/**
- tests/modules/ebi-metagenomics/fastp/**

fetchtool/assembly:
- modules/ebi-metagenomics/fetchtool/assembly/**
- tests/modules/ebi-metagenomics/fetchtool/assembly/**
Expand All @@ -30,6 +34,14 @@ infernal/cmsearch:
- modules/ebi-metagenomics/infernal/cmsearch/**
- tests/modules/ebi-metagenomics/infernal/cmsearch/**

seqtk/seq:
- modules/ebi-metagenomics/seqtk/seq/**
- tests/modules/ebi-metagenomics/seqtk/seq/**

subworkflows/combined_gene_caller:
- subworkflows/ebi-metagenomics/combined_gene_caller/**
- tests/subworkflows/ebi-metagenomics/combined_gene_caller/**

subworkflows/reads_qc:
- subworkflows/ebi-metagenomics/reads_qc/**
- tests/subworkflows/ebi-metagenomics/reads_qc/**
Binary file not shown.
Binary file not shown.
Loading