EBI-Metagenomics · chrisAta · Sep 27, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 18, 2023
diff --git a/modules/ebi-metagenomics/fastp/main.nf b/modules/ebi-metagenomics/fastp/main.nf
@@ -0,0 +1,102 @@
+// This fastp module is simply copied from the already-existing nf-core module (https://nf-co.re/modules/fastp, https://github.com/nf-core/modules/commit/d497a4868ace3302016ea8ed4b395072d5e833cd)
+// This is because there are not currently any nf-core ways of adding modules from more than one nf-core repo
+// One slight change to it compared to the original is I've removed the "adapter_fasta" input as we are unlikely
+// to need it for our purposes
+
+// FASTP: runs fastp on the reads of one sample and emits the processed reads together
+// with the JSON and HTML reports, the captured stderr log and a versions.yml file.
+//
+// Inputs:
+//   meta              - sample map; `meta.id` is used for the tag and the default file
+//                       prefix, `meta.single_end` selects the single-end or paired-end command
+//   reads             - one FASTQ (single-end / interleaved) or two FASTQs (paired-end)
+//   save_trimmed_fail - when true, adds --failed_out (single-end) or --unpaired1/--unpaired2
+//                       (paired-end), all writing to *.fail.fastq.gz
+//   save_merged       - when true (paired-end branch only), adds -m --merged_out so merged
+//                       pairs are written to *.merged.fastq.gz
+//
+// Every branch of the script also writes versions.yml with the reported fastp version.
+process FASTP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::fastp=0.23.4"
+    // Singularity uses the depot.galaxyproject.org image unless
+    // task.ext.singularity_pull_docker_container is set; otherwise the biocontainers tag is used.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+        'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    val   save_trimmed_fail
+    val   save_merged
+
+    output:
+    // reads, reads_fail and reads_merged are optional: a run may not produce them
+    // (e.g. the fail/merged files are only requested when the matching flag is true).
+    tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads
+    tuple val(meta), path('*.json')           , emit: json
+    tuple val(meta), path('*.html')           , emit: html
+    tuple val(meta), path('*.log')            , emit: log
+    path "versions.yml"                       , emit: versions
+    tuple val(meta), path('*.fail.fastq.gz')  , optional:true, emit: reads_fail
+    tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged
+
+    when:
+    // Runs unless task.ext.when is explicitly set to a falsy value.
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Extra fastp options come from task.ext.args; output names are built from
+    // task.ext.prefix, falling back to meta.id.
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Options that capture discarded reads; empty when save_trimmed_fail is false.
+    def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
+    // Added soft-links to original fastqs for consistent naming in MultiQC
+    // Use single ended for interleaved. Add --interleaved_in in config.
+    if ( task.ext.args?.contains('--interleaved_in') ) {
+        // Interleaved input: fastp writes the processed reads to stdout (--stdout) and the
+        // stream is gzipped into a single <prefix>.fastp.fastq.gz; stderr goes to the log.
+        """
+        [ ! -f  ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
+
+        fastp \\
+            --stdout \\
+            --in1 ${prefix}.fastq.gz \\
+            --thread $task.cpus \\
+            --json ${prefix}.fastp.json \\
+            --html ${prefix}.fastp.html \\
+            $fail_fastq \\
+            $args \\
+            2> ${prefix}.fastp.log \\
+        | gzip -c > ${prefix}.fastp.fastq.gz
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+        """
+    } else if (meta.single_end) {
+        // Single-end input: one file in, one <prefix>.fastp.fastq.gz out.
+        """
+        [ ! -f  ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
+
+        fastp \\
+            --in1 ${prefix}.fastq.gz \\
+            --out1  ${prefix}.fastp.fastq.gz \\
+            --thread $task.cpus \\
+            --json ${prefix}.fastp.json \\
+            --html ${prefix}.fastp.html \\
+            $fail_fastq \\
+            $args \\
+            2> ${prefix}.fastp.log
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+        """
+    } else {
+        // Paired-end input: reads[0]/reads[1] are linked as <prefix>_1/_2 and
+        // --detect_adapter_for_pe is always passed. Merging is only requested when
+        // save_merged is true.
+        def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
+        """
+        [ ! -f  ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
+        [ ! -f  ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz
+        fastp \\
+            --in1 ${prefix}_1.fastq.gz \\
+            --in2 ${prefix}_2.fastq.gz \\
+            --out1 ${prefix}_1.fastp.fastq.gz \\
+            --out2 ${prefix}_2.fastp.fastq.gz \\
+            --json ${prefix}.fastp.json \\
+            --html ${prefix}.fastp.html \\
+            $fail_fastq \\
+            $merge_fastq \\
+            --thread $task.cpus \\
+            --detect_adapter_for_pe \\
+            $args \\
+            2> ${prefix}.fastp.log
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/ebi-metagenomics/fastp/meta.yml b/modules/ebi-metagenomics/fastp/meta.yml
@@ -0,0 +1,69 @@
+name: fastp
+description: Perform adapter/quality trimming on sequencing reads
+keywords:
+  - trimming
+  - quality control
+  - fastq
+tools:
+  - fastp:
+      description: |
+        A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.
+      documentation: https://github.com/OpenGene/fastp
+      doi: 10.1093/bioinformatics/bty560
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively. If you wish to run interleaved paired-end data,  supply as single-end data
+        but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module.
+  - save_trimmed_fail:
+      type: boolean
+      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
+  - save_merged:
+      type: boolean
+      description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: The trimmed/modified/unmerged fastq reads
+      pattern: "*fastp.fastq.gz"
+  - json:
+      type: file
+      description: Results in JSON format
+      pattern: "*.json"
+  - html:
+      type: file
+      description: Results in HTML format
+      pattern: "*.html"
+  - log:
+      type: file
+      description: fastp log file
+      pattern: "*.log"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - reads_fail:
+      type: file
+      description: Reads that failed the preprocessing
+      pattern: "*fail.fastq.gz"
+  - reads_merged:
+      type: file
+      description: Reads that were successfully merged
+      pattern: "*.merged.fastq.gz"
+authors:
+  - "@drpatelh"
+  - "@kevinmenden"
diff --git a/modules/ebi-metagenomics/seqtk/seq/main.nf b/modules/ebi-metagenomics/seqtk/seq/main.nf
@@ -0,0 +1,43 @@
+// This seqtk/seq module is simply copied from the already-existing nf-core module (https://nf-co.re/modules/seqtk_seq/, https://github.com/nf-core/modules/commit/726ee59cd9360a965d96ea9ea8770f16b8ddd6cc)
+// This is because there are not currently any nf-core ways of adding modules from more than one nf-core repo
+
+// SEQTK_SEQ: runs `seqtk seq` on one FASTA/FASTQ file and gzips the result.
+//
+// Input:
+//   meta  - sample map; `meta.id` is used for the tag and the default output prefix
+//   fastx - the FASTA or FASTQ file to transform
+// Output:
+//   fastx    - <prefix>.seqtk-seq.<fasta|fastq>.gz
+//   versions - versions.yml with the seqtk version
+process SEQTK_SEQ {
+    tag "$meta.id"
+    label 'process_single'
+
+    // The conda pin matches the release used by both container images (1.3), so every
+    // execution profile runs the same seqtk version.
+    conda "bioconda::seqtk=1.3"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' :
+        'biocontainers/seqtk:1.3--h5bf99c6_3' }"
+
+    input:
+    tuple val(meta), path(fastx)
+
+    output:
+    tuple val(meta), path("*.gz")     , emit: fastx
+    path "versions.yml"               , emit: versions
+
+    when:
+    // Runs unless task.ext.when is explicitly set to a falsy value.
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // The input counts as FASTA when its name ends in .fasta/.fa/.fas/.fna, optionally
+    // followed by .gz (the dot before "gz" is matched literally).
+    def fasta_input = "$fastx" ==~ /.+\.(fasta|fa|fas|fna)(\.gz)?/
+    // -a / -A in ext.args is treated as a request for FASTA output. The flag is searched
+    // for as a standalone token anywhere in the options, so it is still detected when
+    // combined with other arguments (e.g. '-a -L 50'), not only when it is the whole string.
+    def fasta_forced = ("$args" =~ /(?<!\S)-[aA](?!\S)/).find()
+    def extension = (fasta_input || fasta_forced) ? "fasta" : "fastq"
+    """
+    seqtk \\
+        seq \\
+        $args \\
+        $fastx | \\
+        gzip -c > ${prefix}.seqtk-seq.${extension}.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/ebi-metagenomics/seqtk/seq/meta.yml b/modules/ebi-metagenomics/seqtk/seq/meta.yml
@@ -0,0 +1,44 @@
+name: seqtk_seq
+description: Common transformation operations on FASTA or FASTQ files.
+keywords:
+  - seq
+  - fasta
+  - fastq
+tools:
+  - seqtk:
+      description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. The seqtk seq command enables common transformation operations on FASTA or FASTQ files.
+      homepage: https://github.com/lh3/seqtk
+      documentation: https://docs.csc.fi/apps/seqtk/
+      tool_dev_url: https://github.com/lh3/seqtk
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - fastx:
+      type: file
+      description: A FASTQ or FASTA file
+      pattern: "*.{fastq.gz,fastq,fq,fq.gz,fasta,fasta.gz,fa,fa.gz,fas,fas.gz,fna,fna.gz}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - fastx:
+      type: file
+      description: Gzip-compressed FASTQ/FASTA file containing the transformed sequences
+      pattern: "*.{fastq.gz,fasta.gz}"
+
+authors:
+  - "@hseabolt"
+  - "@mjcipriano"
+  - "@sateeshperi"
diff --git a/subworkflows/ebi-metagenomics/reads_qc/main.nf b/subworkflows/ebi-metagenomics/reads_qc/main.nf
@@ -0,0 +1,33 @@
+
+include { FASTP      } from '../../../modules/ebi-metagenomics/fastp/main'
+include { SEQTK_SEQ     } from '../../../modules/ebi-metagenomics/seqtk/seq/main'
+
+// READS_QC: cleans reads with FASTP, then converts the single-end and merged
+// paired-end reads with SEQTK_SEQ.
+workflow READS_QC {
+
+    take:
+    ch_reads // channel: [ val(meta), [ fastq ] ]
+
+    main:
+
+    // Whether failed and merged reads are kept is controlled by pipeline params.
+    FASTP(ch_reads, params.save_trimmed_fail, params.save_merged)
+
+    // Of the fastp-processed reads, keep only samples flagged single_end in their meta map,
+    // then append the merged paired-end reads.
+    ch_trimmed_single_end = FASTP.out.reads.filter { sample -> sample[0].single_end }
+    ch_se_and_merged      = ch_trimmed_single_end.concat(FASTP.out.reads_merged)
+
+    SEQTK_SEQ(ch_se_and_merged)
+
+    // One versions.yml per tool is enough, hence first() on each versions channel.
+    ch_versions = Channel.empty()
+        .mix(FASTP.out.versions.first())
+        .mix(SEQTK_SEQ.out.versions.first())
+
+    emit:
+    reads               = FASTP.out.reads           // channel: [ val(meta), [ fastq ] ]
+    reads_se_and_merged = ch_se_and_merged          // channel: [ val(meta), [ fastq ] ]
+    fastp_summary_json  = FASTP.out.json            // channel: [ val(meta), [ json ] ]
+    reads_fasta         = SEQTK_SEQ.out.fastx       // channel: [ val(meta), [ fasta ] ]
+    versions            = ch_versions               // channel: [ versions.yml ]
+}
+
diff --git a/subworkflows/ebi-metagenomics/reads_qc/meta.yml b/subworkflows/ebi-metagenomics/reads_qc/meta.yml
@@ -0,0 +1,52 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "reads_qc"
+description: |
+  Quality control and merging of fastq-format short-reads using fastp, generating fasta
+keywords:
+  - trimming
+  - quality control
+  - merging
+  - fastq
+  - fasta
+components:
+  - fastp
+  - seqtk/seq
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test' ]`
+  - reads:
+      type: file
+      description: The trimmed/modified/unmerged fastq reads
+      pattern: "*fastp.fastq.gz"
+  - reads_se_and_merged:
+      type: file
+      description: fastp-cleaned single-end reads and merged paired-end reads
+      pattern: "*.{fastp,merged}.fastq.gz"
+  - fastp_summary_json:
+      type: file
+      description: fastp results in JSON format
+      pattern: "*.json"
+  - reads_fasta:
+      type: file
+      description: FASTA file converted from FASTQ
+      pattern: "*.fasta.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@chrisata"
diff --git a/subworkflows/ebi-metagenomics/reads_qc/nextflow.config b/subworkflows/ebi-metagenomics/reads_qc/nextflow.config
@@ -0,0 +1,12 @@
+// Defaults for the reads_qc subworkflow.
+
+// Passed to FASTP as save_trimmed_fail / save_merged: keep both the failed reads
+// and the merged read pairs.
+params.save_trimmed_fail = true
+params.save_merged       = true
+
+process {
+
+    // Run seqtk seq with -a; SEQTK_SEQ then names its output with the fasta extension.
+    withName: 'SEQTK_SEQ' {
+        ext.args = '-a'
+    }
+}
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
@@ -18,6 +18,10 @@ eggnogmapper:
   - modules/ebi-metagenomics/eggnogmapper/**
   - tests/modules/ebi-metagenomics/eggnogmapper/**
 
+fastp:
+  - modules/ebi-metagenomics/fastp/**
+  - tests/modules/ebi-metagenomics/fastp/**
+
 fetchtool/assembly:
   - modules/ebi-metagenomics/fetchtool/assembly/**
   - tests/modules/ebi-metagenomics/fetchtool/assembly/**
@@ -30,6 +34,14 @@ infernal/cmsearch:
   - modules/ebi-metagenomics/infernal/cmsearch/**
   - tests/modules/ebi-metagenomics/infernal/cmsearch/**
 
+seqtk/seq:
+  - modules/ebi-metagenomics/seqtk/seq/**
+  - tests/modules/ebi-metagenomics/seqtk/seq/**
+
 subworkflows/combined_gene_caller:
   - subworkflows/ebi-metagenomics/combined_gene_caller/**
   - tests/subworkflows/ebi-metagenomics/combined_gene_caller/**
+
+subworkflows/reads_qc:
+  - subworkflows/ebi-metagenomics/reads_qc/**
+  - tests/subworkflows/ebi-metagenomics/reads_qc/**
diff --git a/tests/modules/ebi-metagenomics/fastp/data/SRR21814853_1.fastq.gz b/tests/modules/ebi-metagenomics/fastp/data/SRR21814853_1.fastq.gz
diff --git a/tests/modules/ebi-metagenomics/fastp/data/SRR21814853_2.fastq.gz b/tests/modules/ebi-metagenomics/fastp/data/SRR21814853_2.fastq.gz