diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 905c58e4..8d12b98a 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -32,7 +32,7 @@ jobs: - uses: actions/setup-node@v4 - name: Install Prettier - run: npm install -g prettier + run: npm install -g prettier@3.1.0 - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} @@ -84,7 +84,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==2.11 - name: Run nf-core lint env: diff --git a/.nf-core.yml b/.nf-core.yml index 3e9d09b4..2a47982a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -17,6 +17,7 @@ lint: - docs/images/nf-core-blobtoolkit_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.yml - .github/PULL_REQUEST_TEMPLATE.md + - .github/workflows/linting.yml multiqc_config: - report_comment nextflow_config: diff --git a/CHANGELOG.md b/CHANGELOG.md index bd4cc71d..56007304 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,36 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.3.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.3.0)] – Poliwag – [2024-02-09] + +The pipeline has now been validated on five genomes, all under 100 Mbp: a +sponge, a platyhelminth, and three fungi. + +### Enhancements & fixes + +- Fixed the conditional runs of blastn +- Fixed the generation of the no-hit list +- Fixed the conversion of the unaligned input files to Fasta +- Fixed the documentation about preparing the NT database +- Fixed the detection of the NT database in the nf-core module +- The pipeline now supports samplesheets generated by the + [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline by passing the + `--fetchngs_samplesheet true` option. +- FastQ files can bypass the conversion to Fasta +- Fixed missing BUSCO results from the blobdir (only 1 BUSCO was loaded) +- Fixed the default category used to colour the blob plots +- Fixed the output directory of the images +- Added an option to select the format of the images (PNG or SVG) + +### Parameters + +| Old parameter | New parameter | +| ------------- | ---------------------- | +| | --fetchngs_samplesheet | +| | --image_format | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
> **NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. + ## [[0.2.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Pikachu – [2023-12-22] ### Enhancements & fixes diff --git a/README.md b/README.md index fab2a350..c2f2a9fc 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,6 @@ **sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes. It takes a samplesheet and aligned CRAM files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots. - - - - - - - - 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) 2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk)) 3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) @@ -44,9 +31,6 @@ > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: @@ -58,12 +42,10 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram ``` -Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. +Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. Now, you can run the pipeline using: - - ```bash nextflow run sanger-tol/blobtoolkit \ -profile \ @@ -86,7 +68,7 @@ For more details, please refer to the [usage documentation](https://pipelines.to ## Pipeline output - For more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output). +For more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output). ## Credits diff --git a/bin/check_fetchngs_samplesheet.py b/bin/check_fetchngs_samplesheet.py new file mode 100755 index 00000000..324811c9 --- /dev/null +++ b/bin/check_fetchngs_samplesheet.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. 
+ + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = (".fastq.gz",) + + def __init__( + self, + accession_col="run_accession", + model_col="instrument_model", + platform_col="instrument_platform", + library_col="library_strategy", + file1_col="fastq_1", + file2_col="fastq_2", + **kwargs, + ): + """ + Initialize the row checker with the expected column names. + + Args: + accession_col (str): The name of the column that contains the accession name + (default "run_accession"). + model_col (str): The name of the column that contains the model name + of the instrument (default "instrument_model"). + platform_col (str): The name of the column that contains the platform name + of the instrument (default "instrument_platform"). + library_col (str): The name of the column that contains the strategy of the + preparation of the library (default "library_strategy"). + file2_col (str): The name of the column that contains the second file path + for the paired-end read data (default "fastq_2"). + """ + super().__init__(**kwargs) + self._accession_col = accession_col + self._model_col = model_col + self._platform_col = platform_col + self._library_col = library_col + self._file1_col = file1_col + self._file2_col = file2_col + self._seen = set() + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + self._validate_accession(row) + self._validate_file(row) + self._seen.add((row[self._accession_col], row[self._file1_col])) + self.modified.append(row) + + def _validate_accession(self, row): + """Assert that the run accession name exists.""" + if len(row[self._accession_col]) <= 0: + raise AssertionError("Run accession is required.") + + def _validate_file(self, row): + """Assert that the datafile is non-empty and has the right format.""" + if len(row[self._file1_col]) <= 0: + raise AssertionError("Data file is required.") + self._validate_data_format(row[self._file1_col]) + if row[self._file2_col]: + self._validate_data_format(row[self._file2_col]) + + def _validate_data_format(self, filename): + """Assert that a given filename has one of the expected FASTQ extensions.""" + if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): + raise AssertionError( + f"The data file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) + + def validate_unique_accessions(self): + """ + Assert that the combination of accession name and aligned filename is unique. + + In addition to the validation, also rename all accessions to have a suffix of _T{n}, where n is the + number of times the same accession exist, but with different FASTQ files, e.g., multiple runs per experiment. 
+ + """ + if len(self._seen) != len(self.modified): + raise AssertionError("The pair of accession and file name must be unique.") + seen = Counter() + for row in self.modified: + accession = row[self._accession_col] + seen[accession] += 1 + row[self._accession_col] = f"{accession}_T{seen[accession]}" + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by sanger-tol pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + + Example: + This function checks that the samplesheet follows the following structure, + see also the `blobtoolkit samplesheet`_:: + + sample,datatype,datafile + sample1,hic,/path/to/file1.cram + sample1,pacbio,/path/to/file2.cram + sample1,ont,/path/to/file3.cram + + .. _blobtoolkit samplesheet: + https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv + + """ + required_columns = { + "run_accession", + "instrument_model", + "instrument_platform", + "library_strategy", + "fastq_1", + "fastq_2", + } + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_accessions() + header = list(reader.fieldnames) + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
+ with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + parser.add_argument( + "-v", + "--version", + action="version", + version="%(prog)s 1.0.0", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index f5bf5c5b..c63d06fe 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -27,6 +27,8 @@ class RowChecker: VALID_FORMATS = ( ".cram", ".bam", + ".fastq", + ".fastq.gz", ) VALID_DATATYPES = ( diff --git a/bin/nohitlist.sh b/bin/nohitlist.sh index c935cebe..bd9bcc14 100755 --- a/bin/nohitlist.sh +++ b/bin/nohitlist.sh @@ -8,8 +8,8 @@ E=$4 # find ids of sequences with no hits in the blastx search grep '>' $fasta | \ - grep -v -w -f <(awk -v evalue="$E" '{{if($14<{evalue}){{print $1}}}}' $blast | sort | uniq) | \ - cut -f1 | sed 's/>//' > $prefix.nohit.txt + grep -v -w -f <(awk -v evalue="$E" '{if($14//' > $prefix.nohit.txt diff --git a/conf/base.config b/conf/base.config index 4d5e9045..6ebea12c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,7 +16,7 @@ process { time = { check_max( 4.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 5 maxErrors = '-1' // Process-specific resource requirements @@ -52,13 +52,6 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } diff --git a/conf/modules.config b/conf/modules.config index 155111ab..974728f5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -62,10 +62,13 @@ process { withName: "BUSCO" { scratch = true - // Overridden in the test profile, see at the end of this file - ext.args = "--force" + ext.args = { 'test' in workflow.profile.tokenize(',') ? + // Additional configuration to speed processes up during testing. 
+ // Note: BUSCO *must* see the double-quotes around the parameters + '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' + : '--force' } publishDir = [ - path: { "${params.outdir}/BUSCO" }, + path: { "${params.outdir}/busco" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] @@ -89,25 +92,15 @@ process { withName: "BLOBTOOLKIT_CREATEBLOBDIR" { ext.args = "--evalue 1.0e-25 --hit-count 10" - publishDir = [ - path: { "${params.outdir}/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? null : filename } - ] } withName: "BLOBTOOLKIT_UPDATEBLOBDIR" { - ext.args = "--evalue 1.0e-25 --hit-count 10 --update-plot" - publishDir = [ - path: { "${params.outdir}/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? null : filename } - ] + ext.args = "--evalue 1.0e-25 --hit-count 10" } withName: "BLOBTOOLKIT_SUMMARY" { publishDir = [ - path: { "${params.outdir}/${blobdir.name}/" }, + path: { "${params.outdir}/blobtoolkit/${blobdir.name}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] @@ -115,7 +108,7 @@ process { withName: "BLOBTK_IMAGES" { publishDir = [ - path: { "${params.outdir}/${blobdir.name}/" }, + path: { "${params.outdir}/blobtoolkit/plots" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] @@ -147,7 +140,7 @@ process { withName: "BLOBTOOLKIT_UPDATEMETA" { publishDir = [ - path: { "${params.outdir}/" }, + path: { "${params.outdir}/blobtoolkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] @@ -164,26 +157,3 @@ process { } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Additional configuration to speed processes up during testing. - ----------------------------------------------------------------------------------------- -*/ - -profiles { - test { - process { - withName: BUSCO { - // Note: BUSCO *must* see the double-quotes around the parameters - ext.args = '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' - publishDir = [ - path: { "${params.outdir}/BUSCO" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } - ] - } - } - } -} diff --git a/conf/test.config b/conf/test.config index 221a0f22..623cf3f9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,7 +30,7 @@ params { taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" diff --git a/conf/test_full.config b/conf/test_full.config index ff1ac068..6af9eecb 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -25,7 +25,7 @@ params { taxon = "Laetiporus sulphureus" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" busco = "/lustre/scratch123/tol/resources/busco/latest" blastp = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd" diff --git a/conf/test_raw.config b/conf/test_raw.config index 6d4174c2..47cc4267 100644 --- a/conf/test_raw.config +++ b/conf/test_raw.config @@ -31,8 +31,8 @@ params { taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" + taxdump = "/lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump" + busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" blastn = "${projectDir}/assets/test/nt_mMelMel3.1/" diff --git a/docs/decision-records/README.md b/docs/decision-records/README.md deleted file mode 100644 index bd17babb..00000000 --- a/docs/decision-records/README.md +++ /dev/null @@ -1,25 +0,0 @@ -Design decisions about the pipeline are indexed and recorded as individual files in this directory. - -To add a new decision, please create a pull request that adds a new markdown file named `XX-short-summary.md` to this directory. When replacing a previous decision, change the status of the latter to "Superseded" and add this to the title of the file `superseded-XX-short-summary.md`. The new file should have the following structure: - -## Title – Decision Statement - -## Status – Either Proposed, Rejected, Current, Deprecated or Superseded - -If this issue has been superseded, please add a line saying 'Superseded by '. - -## Context - -Explain why a decision is needed (problem statement) and provide details of the different options considered when making this decision. - -## Decision - -State what option was selected and why was it picked over other choices. - -## Consequences - -Reflect on how this decision will impact other planned work, or what new work needs to be planned to implement the decision. - -## Discussion Notes and Linked Issues or Pull Requests - -Add any offline discussion notes here, along with associated issue(s) and pull request links. 
diff --git a/docs/output.md b/docs/output.md index ffa089a9..e6efe8bc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -29,6 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive - `*.json`: files generated from genome and alignment coverage statistics - `*.png`: static plot images +More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer) + ### MultiQC diff --git a/docs/usage.md b/docs/usage.md index 4e4c9d7c..84229b17 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,6 +48,11 @@ sample3,ont,ont.cram An [example samplesheet](assets/test/samplesheet.csv) has been provided with the pipeline. +### Support for [nf-core/fetchngs](https://nf-co.re/fetchngs) + +The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0). +The pipeline then needs the `--fetchngs_samplesheet true` option _and_ `--align true`, since the data files would all be unaligned. + ## Getting databases ready for the pipeline The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases: @@ -90,7 +95,7 @@ cd $NT Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We are using the `&&` syntax to ensure that each command completes without error before the next one is run: ```bash -wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.??.tar.gz" -P $NT/ && +wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ && for file in $NT/*.tar.gz; do tar xf $file -C $NT && rm $file; done diff --git a/modules.json b/modules.json index 38431cf4..7ba1a8db 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "f0d13ae7e1f9b24a705764f8673af859268d7077", + "git_sha": "209e5a3e2753c5e628736a662c877c20f341ee15", "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, @@ -17,6 +17,12 @@ "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco.diff" }, + "cat/cat": { + "branch": "master", + "git_sha": "81f27e75847087865299cc46605deb3b09b4e0a2", + "installed_by": ["modules"], + "patch": "modules/nf-core/cat/cat/cat-cat.diff" + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", @@ -59,8 +65,9 @@ }, "samtools/fasta": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] + "git_sha": "9b1071e19265cf9c0d06958a011cf7a9cfe37213", + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/fasta/samtools-fasta.diff" }, "samtools/index": { "branch": "master", diff --git a/modules/local/blobtk/images.nf b/modules/local/blobtk/images.nf index 1b6e8087..48a9b1d4 100644 --- a/modules/local/blobtk/images.nf +++ b/modules/local/blobtk/images.nf @@ -10,9 +10,11 @@ process BLOBTK_IMAGES { input: tuple val(meta), path(blobdir) each plot + val format output: - tuple val(meta), path('*.png') , emit: png + tuple val(meta), path('*.png') , optional: true, emit: png + tuple val(meta), path('*.svg') , optional: true, emit: svg path "versions.yml" , emit: versions when: @@ -26,7 +28,7 @@ process BLOBTK_IMAGES { blobtk plot \\ -v ${plot} \\ -d ${blobdir} \\ - -o ${prefix}.${plot}.png \\ + -o ${prefix}.${plot}.${format} \\ ${legend} \\ $args diff --git a/modules/local/blobtoolkit/chunk.nf b/modules/local/blobtoolkit/chunk.nf index 38bc37fe..73f27532 
100644 --- a/modules/local/blobtoolkit/chunk.nf +++ b/modules/local/blobtoolkit/chunk.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CHUNK { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta) , path(fasta) diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf index 0a9c2f58..d93b85b4 100644 --- a/modules/local/blobtoolkit/config.nf +++ b/modules/local/blobtoolkit/config.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CONFIG { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), val(reads) diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf index e151cde8..203633e1 100644 --- a/modules/local/blobtoolkit/countbuscos.nf +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(table, stageAs: 'dir??/*') @@ -21,7 +21,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def busco_inputs = table.collect{"--in $it"}.join(' ') + def busco_inputs = (table instanceof List ? table : [table]).collect{"--in $it"}.join(' ') """ btk pipeline count-busco-genes \\ $busco_inputs \\ diff --git a/modules/local/blobtoolkit/createblobdir.nf b/modules/local/blobtoolkit/createblobdir.nf index 54810650..2c8517ab 100644 --- a/modules/local/blobtoolkit/createblobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -5,11 +5,11 @@ process BLOBTOOLKIT_CREATEBLOBDIR { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(window, stageAs: 'windowstats/*') - tuple val(meta1), path(busco) + tuple val(meta1), path(busco, stageAs: 'lineage??/*') tuple val(meta2), path(blastp) tuple val(meta3), path(yaml) path(taxdump) @@ -24,6 +24,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" + def busco_args = (busco instanceof List ? busco : [busco]).collect { "--busco " + it } .join(' ') def hits_blastp = blastp ? 
"--hits ${blastp}" : "" """ blobtools replace \\ @@ -31,7 +32,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR { --meta ${yaml} \\ --taxdump ${taxdump} \\ --taxrule buscogenes \\ - --busco ${busco} \\ + ${busco_args} \\ ${hits_blastp} \\ --threads ${task.cpus} \\ $args \\ diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf index e34bfd93..128780fe 100644 --- a/modules/local/blobtoolkit/extractbuscos.nf +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(fasta) @@ -21,7 +21,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def seq_args = seq.collect { "--busco " + it } .join(' ') + def seq_args = (seq instanceof List ? seq : [seq]).collect { "--busco " + it } .join(' ') """ btk pipeline extract-busco-genes \\ $seq_args \\ diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf index 8e2d585d..96948345 100644 --- a/modules/local/blobtoolkit/metadata.nf +++ b/modules/local/blobtoolkit/metadata.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_METADATA { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(yaml) diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf index ac92a3b3..45f0471a 100644 --- a/modules/local/blobtoolkit/summary.nf +++ b/modules/local/blobtoolkit/summary.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(blobdir) diff --git a/modules/local/blobtoolkit/unchunk.nf b/modules/local/blobtoolkit/unchunk.nf index b544bf1f..f9797178 100644 --- a/modules/local/blobtoolkit/unchunk.nf +++ b/modules/local/blobtoolkit/unchunk.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UNCHUNK { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(blast_table) diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf index 7a677828..cbcdc7b5 100644 --- a/modules/local/blobtoolkit/updateblobdir.nf +++ b/modules/local/blobtoolkit/updateblobdir.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(input) diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf index dde880e6..26bd49f5 100644 --- a/modules/local/blobtoolkit/windowstats.nf +++ b/modules/local/blobtoolkit/windowstats.nf @@ -3,9 +3,9 @@ process BLOBTOOLKIT_WINDOWSTATS { label 'process_single' if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." + exit 1, "BLOBTOOLKIT_WINDOWSTATS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/genomehubs/blobtoolkit:4.3.2" + container "docker.io/genomehubs/blobtoolkit:4.3.3" input: tuple val(meta), path(tsv) diff --git a/modules/local/fetchngssamplesheet_check.nf b/modules/local/fetchngssamplesheet_check.nf new file mode 100644 index 00000000..962768d5 --- /dev/null +++ b/modules/local/fetchngssamplesheet_check.nf @@ -0,0 +1,32 @@ +process FETCHNGSSAMPLESHEET_CHECK { + tag "$samplesheet" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path samplesheet + + output: + path '*.csv' , emit: csv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in sanger-tol/blobtoolkit/bin/ + """ + check_fetchngs_samplesheet.py \\ + $samplesheet \\ + samplesheet.valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + check_fetchngs_samplesheet.py: \$(check_fetchngs_samplesheet.py --version | cut -d' ' -f2) + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index dc3f108f..1695c793 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -9,16 +9,15 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,7 +24,7 @@ +@@ -23,6 +24,7 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta -- + def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + """ if [ "${is_compressed}" == "true" ]; then - gzip -c -d ${fasta} > ${fasta_name} -@@ -34,6 +35,7 @@ +@@ -39,6 +41,7 @@ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 44b581a9..065ad7cd 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -25,12 +25,18 @@ process BLAST_BLASTN { def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta def exclude_taxon = taxid ? 
"-negative_taxids ${taxid}" : '' + """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` + if [ -z "\$DB" ]; then + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + fi + echo Using \$DB + blastn \\ -num_threads ${task.cpus} \\ -db \$DB \\ diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test index 0e909a7e..02ecfab5 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -8,6 +8,7 @@ nextflow_process { tag "modules_nfcore" tag "blast" tag "blast/blastn" + tag "blast/makeblastdb" setup { run("BLAST_MAKEBLASTDB") { diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 00000000..adbdbd7b --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,79 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // choose appropriate concatenation tool depending on input and output format + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." 
+ } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..00a8db0b --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..aaae04f9 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,177 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") } + ) + } + } + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = 
"${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..0c9bfe8d --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2024-01-12T14:02:02.999254641" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + 
"GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_unzipped_zipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2024-01-12T14:08:26.948048418" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2024-01-12T14:10:22.445700266" + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf index 63e2852e..4b0cad9a 100644 --- a/modules/nf-core/samtools/fasta/main.nf +++ b/modules/nf-core/samtools/fasta/main.nf @@ -24,7 +24,7 @@ process SAMTOOLS_FASTA { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + def output = ( interleave && ! meta.single_end ) ? "| gzip > ${prefix}_interleaved.fasta.gz" : meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" """ @@ -32,7 +32,6 @@ process SAMTOOLS_FASTA { fasta \\ $args \\ --threads ${task.cpus-1} \\ - -0 ${prefix}_other.fasta.gz \\ $input \\ $output diff --git a/modules/nf-core/samtools/fasta/samtools-fasta.diff b/modules/nf-core/samtools/fasta/samtools-fasta.diff new file mode 100644 index 00000000..e2374ed9 --- /dev/null +++ b/modules/nf-core/samtools/fasta/samtools-fasta.diff @@ -0,0 +1,13 @@ +Changes in module 'nf-core/samtools/fasta' +--- modules/nf-core/samtools/fasta/main.nf ++++ modules/nf-core/samtools/fasta/main.nf +@@ -32,7 +32,6 @@ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ +- -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + +************************************************************ diff --git a/nextflow.config b/nextflow.config index 98b0398c..6c9fadf8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { yaml = null align = false mask = false + fetchngs_samplesheet = false // Reference options fasta = null @@ -22,6 +23,9 @@ params { taxon = null taxa_file = null + // Output options + image_format = 'png' + // Databases and related options taxdump = null busco = null @@ -243,7 +247,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.2.0' + version = '0.3.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 37c8a567..97c84534 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,12 +32,23 @@ "description": "Turn on optional genome masking if needed.", "fa_icon": "fas fa-toggle-off" }, + "fetchngs_samplesheet": { + "type": "boolean", + "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.", + "fa_icon": "fas fa-toggle-off" + }, "yaml": { "type": "string", "format": "file-path", "description": "Custom config file for draft assembly", "fa_icon": "fas fa-file-alt" }, + "image_format": { + "type": "string", + "enum": ["png", "svg"], + "description": "Select the format of the output images.", + "fa_icon": "fas fa-image" + }, "outdir": { "type": "string", "format": "directory-path", @@ -67,8 +78,8 @@ "required": ["taxon", "accession", "fasta"], "properties": { "taxon": { - "type": "string", - "description": "NCBI taxonomy ID for the genome species" + "type": ["string", "integer"], + "description": "Name or taxonomy ID for the genome species" }, "accession": { "type": "string", diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 6037de19..a43b26dd 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -51,7 +51,7 @@ workflow BUSCO_DIAMOND { // Add the basal lineages to the list (excluding duplicates) - basal_lineages = [ "archaea_odb10", "bacteria_odb10", "eukaryota_odb10" ] + basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] ch_ancestral_lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () @@ -86,11 +86,26 @@ workflow BUSCO_DIAMOND { ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) - // Select BUSCO results for taxonomically closest database + // Index the lineages in the taxonomic order + def lineage_position = 0 + ch_lineages + | map { lineage -> [lineage, lineage_position++] } + | set { ch_ordered_lineages } + + + // Order BUSCO results according to 
ch_ordered_lineages BUSCO.out.full_table - | combine ( ch_lineages.toList().map { it[0] } ) - | filter { meta, table, lineage -> table =~ /$lineage/ } - | map { meta, table, lineage -> [ meta, table ] } + | map { meta, table -> [table.parent.baseName.minus("run_"), meta, table] } + | join ( ch_ordered_lineages ) + | map { lineage, meta, table, index -> [meta, table, index] } + | groupTuple() + | map { meta, tables, positions -> [ meta, tables.withIndex().sort { a, b -> positions[a[1]] <=> positions[b[1]] } . collect { table, i -> table } ] } + | set { ch_indexed_buscos } + + + // Select BUSCO results for taxonomically closest database + ch_indexed_buscos + | map { meta, tables -> [meta, tables[0]] } | set { ch_first_table } @@ -102,7 +117,7 @@ workflow BUSCO_DIAMOND { emit: first_table = ch_first_table // channel: [ val(meta), path(full_table) ] - full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] + all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index 21baf44a..08bc43c9 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -9,7 +9,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/window workflow COLLATE_STATS { take: - busco_table // channel: [ val(meta), path(full_table) ] + busco // channel: [ val(meta), path(full_table) ] bed // channel: [ val(meta), path(bed) ] freq // channel: [ val(meta), path(freq) ] mononuc // channel: [ val(meta), path(mononuc) ] @@ -20,11 +20,7 @@ workflow COLLATE_STATS { // Count BUSCO genes in a region - busco_table - | groupTuple() - | set { ch_busco } - - BLOBTOOLKIT_COUNTBUSCOS ( ch_busco, bed ) + BLOBTOOLKIT_COUNTBUSCOS ( busco, bed ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_COUNTBUSCOS.out.versions.first() ) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 01849bd1..5b028911 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,8 +2,10 @@ // Check input samplesheet and get aligned read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' -include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' +include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' workflow INPUT_CHECK { take: @@ -14,14 +16,36 @@ workflow INPUT_CHECK { main: ch_versions = Channel.empty() + if ( params.fetchngs_samplesheet ) { + FETCHNGSSAMPLESHEET_CHECK ( samplesheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .branch { row -> + paired: row.fastq_2 + [[id: row.run_accession, row:row], [row.fastq_1, row.fastq_2]] + not_paired: true + } + .set { reads_pairedness } + ch_versions = ch_versions.mix ( FETCHNGSSAMPLESHEET_CHECK.out.versions.first() ) - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_data_channels(it) } - .set { aln } + CAT_CAT ( reads_pairedness.paired ) + ch_versions = ch_versions.mix ( CAT_CAT.out.versions.first() ) + + CAT_CAT.out.file_out + | map { meta, file -> meta.row + [fastq_1: 
file] } + | mix ( reads_pairedness.not_paired ) + | map { create_data_channels_from_fetchngs(it) } + | set { aln } + + } else { + SAMPLESHEET_CHECK ( samplesheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .map { create_data_channels(it) } + .set { aln } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + } - ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) if ( !params.yaml ) { aln @@ -55,6 +79,10 @@ def create_data_channels(LinkedHashMap row) { // add path(s) of the read file(s) to the meta map def data_meta = [] + if ( !params.align && (row.datafile.endsWith(".fastq") || row.datafile.endsWith(".fastq.gz")) ) { + exit 1, "ERROR: Please check input samplesheet and pipeline parameters -> Data file is in FastQ format but --align is not set!\n${row.datafile}" + } + if ( !file(row.datafile).exists() ) { exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" } else { @@ -63,3 +91,39 @@ def create_data_channels(LinkedHashMap row) { return data_meta } + +// Function to get list of [ meta, datafile ] +def create_data_channels_from_fetchngs(LinkedHashMap row) { + // create meta map + def meta = [:] + meta.id = row.run_accession + + // Same as https://github.com/blobtoolkit/blobtoolkit/blob/4.3.3/src/blobtoolkit-pipeline/src/lib/functions.py#L30-L39 + // with the addition of "hic" + switch (row.instrument_platform) { + case "ILLUMINA": + meta.datatype = (row.library_strategy == "Hi-C" ? "hic" : "illumina") + break + case "OXFORD_NANOPORE": + meta.datatype = "ont" + break + case "PACBIO_SMRT": + meta.datatype = (row.instrument_model == "Sequel" ? "pacbio_clr" : "pacbio") + break + default: + meta.datatype = "illumina" + } + + + // add path(s) of the read file(s) to the meta map + def data_meta = [] + + if ( !file(row.fastq_1).exists() ) { + exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.fastq_1}" + } else { + data_meta = [ meta, file(row.fastq_1) ] + } + + return data_meta +} + diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index b9a4409e..e0b479bc 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -20,13 +20,22 @@ workflow MINIMAP2_ALIGNMENT { ch_versions = Channel.empty() - // Convert reads to FASTA - SAMTOOLS_FASTA ( input, true ) + // Convert BAM/CRAM reads to FASTA + input + | branch { + meta, reads -> + fastq: reads.toString().endsWith(".fastq") || reads.toString().endsWith(".fastq.gz") || reads.toString().endsWith(".fq") || reads.toString().endsWith(".fq.gz") + bamcram: true + } + | set { ch_reads_by_type } + + SAMTOOLS_FASTA ( ch_reads_by_type.bamcram, true ) ch_versions = ch_versions.mix(SAMTOOLS_FASTA.out.versions.first()) // Branch input by sequencing type SAMTOOLS_FASTA.out.interleaved + | mix ( ch_reads_by_type.fastq ) | branch { meta, reads -> hic: meta.datatype == "hic" diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index 87cb0a88..5e3c913f 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -52,23 +52,27 @@ workflow RUN_BLASTN { // Run blastn search // run blastn excluding taxon_id BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id ) + ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) // check if blastn output table is empty BLASTN_TAXON.out.txt - | map { meta, txt -> txt.isEmpty() } - | set { is_txt_empty } + | branch { meta, txt -> + empty: txt.isEmpty() + 
not_empty: true + } + | set { ch_blastn_taxon_out } // repeat the blastn search without excluding taxon_id - if ( is_txt_empty ) { - BLAST_BLASTN ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, [] ) - ch_blastn_txt = BLAST_BLASTN.out.txt - } - else { - ch_blastn_txt = BLASTN_TAXON.out.txt - } + ch_blastn_taxon_out.empty.join ( BLOBTOOLKIT_CHUNK.out.chunks ) + | map { meta, txt, fasta -> [meta, fasta] } + | set { ch_blast_blastn_input } + BLAST_BLASTN ( ch_blast_blastn_input, blastn, [] ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) + BLAST_BLASTN.out.txt + | mix( ch_blastn_taxon_out.not_empty ) + | set { ch_blastn_txt } // Unchunk chunked blastn results BLOBTOOLKIT_UNCHUNK ( ch_blastn_txt ) diff --git a/subworkflows/local/view.nf b/subworkflows/local/view.nf index 505d6c36..e2de7ede 100644 --- a/subworkflows/local/view.nf +++ b/subworkflows/local/view.nf @@ -22,16 +22,17 @@ workflow VIEW { // - // Generate static plots in png format + // Generate static plots in png/svg format // plots = [ "blob", "cumulative", "snail" ] - BLOBTK_IMAGES ( blobdir, plots ) + BLOBTK_IMAGES ( blobdir, plots, params.image_format ) ch_versions = ch_versions.mix( BLOBTK_IMAGES.out.versions ) + ch_images = BLOBTK_IMAGES.out.png.mix(BLOBTK_IMAGES.out.svg) emit: summary = BLOBTOOLKIT_SUMMARY.out.json // channel: [ val(meta), path(json) ] - images = BLOBTK_IMAGES.out.png // channel: [ val(meta), path(png) ] + images = ch_images // channel: [ val(meta), path(png/svg) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 0d452a54..944ccc4c 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -28,6 +28,7 @@ if (params.blastp && params.accession) { ch_blastp = Channel.of([ [ 'id': params if (params.blastx && params.accession) { ch_blastx = Channel.of([ [ 'id': params.accession ], params.blastx ]).first() } else { exit 1, 'Diamond BLASTx database and accession must be specified!' } if (params.blastn && params.accession) { ch_blastn = Channel.of([ [ 'id': params.accession ], params.blastn ]).first() } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } +if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } // Create channel for optional parameters if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } @@ -171,7 +172,7 @@ workflow BLOBTOOLKIT { // SUBWORKFLOW: Collate genome statistics by various window sizes // COLLATE_STATS ( - BUSCO_DIAMOND.out.full_table, + BUSCO_DIAMOND.out.all_tables, COVERAGE_STATS.out.bed, COVERAGE_STATS.out.freq, COVERAGE_STATS.out.mononuc, @@ -185,7 +186,7 @@ workflow BLOBTOOLKIT { BLOBTOOLS ( INPUT_CHECK.out.config, COLLATE_STATS.out.window_tsv, - BUSCO_DIAMOND.out.first_table, + BUSCO_DIAMOND.out.all_tables, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]), RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]),
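
Taken together, the changes above introduce two user-facing options (`--fetchngs_samplesheet` and `--image_format`) and enforce that a fetchngs-style samplesheet is run with `--align`, since its FASTQ files are unaligned. The following is a hypothetical invocation sketch combining these options; all paths, the accession and the taxon are placeholders, and the database parameters follow the existing usage documentation:

```bash
# Hypothetical run using the options added in this release (placeholder paths/values):
#   --fetchngs_samplesheet  accept a samplesheet written by nf-core/fetchngs
#   --align                 required with fetchngs input, as the reads are unaligned
#   --image_format          write the static plots as png (default) or svg
nextflow run sanger-tol/blobtoolkit \
    -profile singularity \
    --input fetchngs/samplesheet.csv \
    --fetchngs_samplesheet true \
    --align true \
    --image_format svg \
    --fasta genome.fasta \
    --accession GCA_XXXXXXXXX.X \
    --taxon "Meles meles" \
    --taxdump /path/to/new_taxdump \
    --blastp /path/to/buscogenes.dmnd \
    --blastx /path/to/buscoregions.dmnd \
    --blastn /path/to/nt \
    --outdir results
```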