diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e187..1fcafe8 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: 
dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.nf-core.yml b/.nf-core.yml index 1c764f6..e7be709 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,5 +1,6 @@ repository_type: pipeline +nf_core_version: "2.14.1" lint: files_exist: - assets/nf-core-gasnomenclature_logo_light.png diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a15fda..4b7a138 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,32 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## In-development +## [0.1.0] - 2024/06/28 -- Fixed nf-core tools linting failures introduced in version 2.12.1. -- Added phac-nml prefix to nf-core config - -## 1.0.3 - 2024/02/23 - -- Pinned nf-validation@1.1.3 plugin - -## 1.0.2 - 2023/12/18 - -- Removed GitHub workflows that weren't needed. -- Adding additional parameters for testing purposes. - -## 1.0.1 - 2023/12/06 - -Allowing non-gzipped FASTQ files as input. Default branch is now main. - -## 1.0.0 - 2023/11/30 - -Initial release of phac-nml/gasnomenclature, created with the [nf-core](https://nf-co.re/) template. +Initial release of the Genomic Address Nomenclature pipeline, used to assign cluster addresses to samples based on existing cluster designations. ### `Added` -### `Fixed` - -### `Dependencies` +- Input of cg/wgMLST allele calls produced from [locidex](https://github.com/phac-nml/locidex). +- Output of assigned cluster addresses for any **query** samples using [profile_dists](https://github.com/phac-nml/profile_dists) and [gas call](https://github.com/phac-nml/genomic_address_service). 
-### `Deprecated` +[0.1.0]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.1.0 diff --git a/CITATIONS.md b/CITATIONS.md index 84e1767..600a9e2 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,18 @@ ## Pipeline tools +- [locidex](https://github.com/phac-nml/locidex) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Christy-Lynn, Peterson, Kyrylo Bessonov, Reimer, Aleisha, Schonfeld, Justin. LOCIDEX: Distributed allele calling engine. 2024. https://github.com/phac-nml/locidex + +- [profile_dists](https://github.com/phac-nml/profile_dists) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Profile Dists: Convenient package for comparing genetic similarity of samples based on allelic profiles. 2023. https://github.com/phac-nml/profile_dists + +- [genomic_address_service (GAS)](https://github.com/phac-nml/genomic_address_service) (in-development, citation subject to change) + + > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Genomic Address Service: Convenient package for de novo clustering and sample assignment to existing clusters. 2023. 
https://github.com/phac-nml/genomic_address_service + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/LICENSE b/LICENSE index ae9c66b..0ca6cdb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Aaron Petkau +Copyright (c) Government of Canada Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 303bd5f..499f513 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,80 @@ [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/) -# Example Pipeline for IRIDA Next +# Genomic Address Service Nomenclature Workflow -This is an example pipeline to be used for integration with IRIDA Next. +This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline. + +A brief overview of the usage of this pipeline is given below. Detailed documentation can be found in the [docs/](docs/) directory. # Input The input to the pipeline is a standard sample sheet (passed as `--input samplesheet.csv`) that looks like: -| sample | fastq_1 | fastq_2 | -| ------- | --------------- | --------------- | -| SampleA | file_1.fastq.gz | file_2.fastq.gz | +| sample | mlst_alleles | address | +| ------- | ----------------- | ------- | +| sampleA | sampleA.mlst.json | 1.1.1 | +| sampleQ | sampleQ.mlst.json | | +| sampleF | sampleF.mlst.json | | The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). 
+Details on the columns can be found in the [Full samplesheet](docs/usage.md#full-samplesheet) documentation. + # Parameters The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run. +## Distance Method and Thresholds + +Profile_Dists and the Genomic Address Service workflows can use two distance methods: hamming or scaled. + +### Hamming Distances + +Hamming distances are integers representing the number of differing loci between two sequences and will range between [0, n], where `n` is the total number of loci. When using Hamming distances, you must specify `--pd_distm hamming` and provide Hamming distance thresholds as integers between [0, n]: `--gm_thresholds "10,5,0"` (10, 5, and 0 loci). + +### Scaled Distances + +Scaled distances are floats representing the percentage of differing loci between two sequences and will range between [0.0, 100.0]. When using scaled distances, you must specify `--pd_distm scaled` and provide percentages between [0.0, 100.0] as thresholds: `--gm_thresholds "50,20,0"` (50%, 20%, and 0% of loci). + +### Thresholds and Linkage Methods + +The `--gm_thresholds` parameter sets thresholds for each cluster level, which dictate how sequences are assigned cluster codes. These thresholds specify the maximum allowable differences in loci between sequences sharing the same cluster code at each level. The consistency of these thresholds in ensuring uniform cluster codes across levels depends on the `--gm_method` parameter, which determines the linkage method used for clustering. + +- _Complete Linkage_: When using complete linkage clustering, sequences are grouped such that identical cluster codes at a particular level guarantee that all sequences in that cluster are within the specified threshold distance. 
For example, specifying `--pd_distm hamming` and `--gm_thresholds "10,5,0"` would mean that sequences with no more than 10 loci differences are assigned the same cluster code at the first level, no more than 5 differences at the second level, and identical sequences at the third level. + +- _Average Linkage_: With average linkage clustering, sequences may share the same cluster code if their average distance is below the specified threshold. For instance, sequences with average distances less than 10, 5, and 0 for each level respectively may share the same cluster code. + +- _Single Linkage_: Single linkage clustering can result in merging distant samples into the same cluster if there exists a third sample that bridges the distance between them. This method does not provide strict guarantees on the maximum distance within a cluster, potentially allowing distant sequences to share the same cluster code. + +## Profile_dists + +The following can be used to adjust parameters for the [profile_dists][] tool. + +- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0.0 and 100.0. Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. +- `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1. +- `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1. +- `--pd_file_type`: Output format file type. One of _text_ or _parquet_. +- `--pd_mapping_file`: A file used to map allele codes to integers for internal distance calculations. This is the same file as produced from the _profile dists_ step (the [allele_map.json](docs/output.md#profile-dists) file). 
Normally, this is unneeded unless you wish to override the automated process of mapping alleles to integers. +- `--pd_skip`: Skip QA/QC steps. Can be used as a flag, `--pd_skip`, or passing a boolean, `--pd_skip true` or `--pd_skip false`. +- `--pd_columns`: Path to a file that defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example: + - **Single column format** + ``` + loci1 + loci2 + loci3 + ``` +- `--pd_count_missing`: Count missing alleles as different. Can be used as a flag, `--pd_count_missing`, or passing a boolean, `--pd_count_missing true` or `--pd_count_missing false`. If true, will consider missing allele calls for the same locus between samples as a difference, increasing the distance counts. + +## GAS CALL + +The following can be used to adjust parameters for the [gas call][] tool. + +- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information. +- `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_. +- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. + +## Other + Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schmea.json). 
# Running @@ -39,51 +96,26 @@ An example of the what the contents of the IRIDA Next JSON file looks like for t ``` { "files": { - "global": [ - { - "path": "summary/summary.txt.gz" - } - ], + "global": [], "samples": { - "SAMPLE1": [ + "sampleF": [ { - "path": "assembly/SAMPLE1.assembly.fa.gz" + "path": "input/sampleF_error_report.csv" } ], - "SAMPLE2": [ - { - "path": "assembly/SAMPLE2.assembly.fa.gz" - } - ], - "SAMPLE3": [ - { - "path": "assembly/SAMPLE3.assembly.fa.gz" - } - ] } }, "metadata": { "samples": { - "SAMPLE1": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "sample1_R2.fastq.gz" - }, - "SAMPLE2": { - "reads.1": "sample2_R1.fastq.gz", - "reads.2": "sample2_R2.fastq.gz" - }, - "SAMPLE3": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "null" + "sampleQ": { + "address": "1.1.3", } } } } ``` -Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "assembly/SAMPLE1.assembly.fa.gz"` refers to a file located within `outdir/assembly/SAMPLE1.assembly.fa.gz`. - -There is also a pipeline execution summary output file provided (specified in the above JSON as `"global": [{"path":"summary/summary.txt.gz"}]`). However, there is no formatting specification for this file. +Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "input/sampleF_error_report.csv"` refers to a file located within `outdir/input/sampleF_error_report.csv`. This file is generated only if a sample fails the input check during samplesheet assessment. ## Test profile @@ -95,7 +127,7 @@ nextflow run phac-nml/gasnomenclature -profile docker,test -r main -latest --out # Legal -Copyright 2023 Government of Canada +Copyright 2024 Government of Canada Licensed under the MIT License (the "License"); you may not use this work except in compliance with the License. 
You may obtain a copy of the diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 814a27d..b0d6f9e 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,5 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R2.fastq.gz -SAMPLE2,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R2.fastq.gz -SAMPLE3,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz, +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/assets/schema_input.json b/assets/schema_input.json index 028cdfd..6094f92 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -14,19 +14,20 @@ "unique": true, "errorMessage": "Sample name must be provided and cannot contain spaces" }, - "profile_type": { - "meta": ["profile_type"], - "description": "Determines has already been clustered (True) or if it is new, and requiring nomenclature assignment (False)", - "errorMessage": "Please specify if the mlst profile has already been clustered (True) or if it is new and requires nomenclature assignment (False)", - "type": "boolean" - }, "mlst_alleles": { "type": "string", "format": "file-path", - "pattern": "^\\S+\\.mlst\\.json(\\.gz)?$", - "errorMessage": 
"MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json' or '.mlst.json.gz'" + "pattern": "^\\S+\\.mlst(\\.subtyping)?\\.json(\\.gz)?$", + "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json', '.mlst.json.gz', '.mlst.subtyping.json', or '.mlst.subtyping.json.gz'" + }, + "address": { + "type": "string", + "pattern": "^\\d+(\\.\\d+)*$", + "meta": ["address"], + "description": "The loci-based typing identifier (address) of the sample", + "errorMessage": "Invalid loci-based typing identifier. Please ensure that the address follows the correct format, consisting of one or more digits separated by periods. Example of a valid identifier: '1.1.1'. Please review and correct the entry" } }, - "required": ["sample", "profile_type", "mlst_alleles"] + "required": ["sample", "mlst_alleles"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 4a758fe..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). 
- first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. 
Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. 
_text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. 
- checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/input_assure.py b/bin/input_assure.py new file mode 100755 index 0000000..d99bf2a --- /dev/null +++ b/bin/input_assure.py @@ -0,0 +1,89 @@ +#!/usr/bin/env 
python + +import json +import argparse +import csv +import gzip +import sys + + +def open_file(file_path, mode): + # Open a file based on the file extension + if file_path.endswith(".gz"): + return gzip.open(file_path, mode) + else: + return open(file_path, mode) + + +def check_inputs(json_file, sample_id, address, output_error_file, output_json_file): + with open_file(json_file, "rt") as f: + json_data = json.load(f) + + # Define a variable to store the match_status (True or False) + match_status = sample_id in json_data + + # Initialize the error message + error_message = None + + # Check for multiple keys in the JSON file and define error message + keys = list(json_data.keys()) + original_key = keys[0] if keys else None + + if len(keys) == 0: + error_message = f"{json_file} is completely empty!" + print(error_message) + sys.exit(1) + elif len(keys) > 1: + # Check if sample_id matches any key + if not match_status: + error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed." + # Retain only the specified sample ID + json_data = {sample_id: json_data.pop(original_key)} + else: + error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry" + # Remove all keys except the one matching sample_id + json_data = {sample_id: json_data[sample_id]} + elif not match_status: + # Define error message based on meta.address (query or reference) + if address == "null": + error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + else: + error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. 
The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness." + # Update the JSON file with the new sample ID + json_data[sample_id] = json_data.pop(original_key) + + # Write file containing relevant error messages + if error_message: + with open(output_error_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["sample", "JSON_key", "error_message"]) + writer.writerow([sample_id, keys, error_message]) + + # Write the (possibly re-keyed) JSON data to the gzipped output file; the input file is left untouched + with gzip.open(output_json_file, "wt") as f: + json.dump(json_data, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check sample inputs, force change if ID ≠ KEY, and generate an error report." + ) + parser.add_argument("--input", help="Path to the mlst.json file.", required=True) + parser.add_argument( + "--sample_id", help="Sample ID to check in the JSON file.", required=True + ) + parser.add_argument( + "--address", help="Address to use in the error message.", required=True + ) + parser.add_argument( + "--output_error", help="Path to the error report file.", required=True + ) + parser.add_argument( + "--output_json", help="Path to the MLST JSON file (gzipped).", required=True + ) + + args = parser.parse_args() + + check_inputs( + args.input, args.sample_id, args.address, args.output_error, args.output_json + ) diff --git a/bin/irida-next-output.py b/bin/irida-next-output.py deleted file mode 100755 index 32acd36..0000000 --- a/bin/irida-next-output.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -import json -from pathlib import Path -from mimetypes import guess_type -from functools import partial -import gzip -import sys -import argparse -import os -import glob - - -def get_open(f): - if "gzip" == guess_type(str(f))[1]: - return partial(gzip.open) - else: - return open - - -def main(argv=None): - 
parser = argparse.ArgumentParser( - 
description="Creates example output JSON for loading into IRIDA Next", - epilog="Example: python irida-next-output.py --json-output output.json *.json *.json.gz", - ) - parser.add_argument("files", nargs="+") - parser.add_argument( - "--summary-file", - action="store", - dest="summary_file", - type=str, - help="pipeline summary file", - default=None, - required=True, - ) - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args(argv) - - json_output_file = Path(args.json_output) - if json_output_file.exists(): - sys.stderr.write(f"Error: --json-output [{json_output_file}] exists") - return 1 - - # Not checking for the existance of the summary file - # because the path may be relative to the outdir, which we don't have here. - - input_files = args.files - if isinstance(input_files, str): - input_files = [input_files] - - output_dict = { - "files": { - "summary": {}, - "samples": {}, - }, - "metadata": { - "samples": {}, - }, - } - - output_metadata = { - "files": {"global": [{"path": str(args.summary_file)}], "samples": {}}, - "metadata": {"samples": {}}, - } - - for f in input_files: - _open = get_open(f) - with _open(f, "r") as fh: - sample_metadata = json.load(fh) - output_metadata["files"]["samples"] |= sample_metadata["files"]["samples"] - output_metadata["metadata"]["samples"] |= sample_metadata["metadata"]["samples"] - - data_json = json.dumps(output_metadata, sort_keys=True, indent=4) - _open = get_open(json_output_file) - with _open(json_output_file, "wt") as oh: - oh.write(data_json) - - print(f"Output written to [{json_output_file}]") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/simplify_irida_json.py b/bin/simplify_irida_json.py deleted file mode 100755 index c486625..0000000 --- a/bin/simplify_irida_json.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -import json -import argparse 
-import sys -import gzip -from mimetypes import guess_type -from functools import partial -from pathlib import Path - - -def flatten_dictionary(dictionary): - result = {} - - def flatten(item, name=""): - if type(item) is dict: - for component in item: - flatten(item[component], str(name) + str(component) + ".") - - elif type(item) is list: - for i in range(len(item)): - flatten(item[i], str(name) + str(i + 1) + ".") # i + 1 because biologists - - else: - result[str(name)[:-1]] = item # [:-1] avoids the "." appended on the previous recursion - - flatten(dictionary) - return result - - -def main(): - parser = argparse.ArgumentParser( - description="Simplifies JSON files for use with IRIDA Next", - epilog="Example: python simplify_irida_json.py --json-output output.json input.json", - ) - parser.add_argument("input") - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args() - - json_output_location = Path(args.json_output) - if json_output_location.exists(): - sys.stderr.write("Error: --json-output [{json_output_location}] exists!\n") - return 1 - - json_input_file = args.input - - # Handle GZIP and non-GZIP - encoding = guess_type(json_input_file)[1] - open_file = partial(gzip.open, mode="rt") if encoding == "gzip" else open # partial (function pointer) - - with open_file(json_input_file) as input_file: - input_json = json.load(input_file) - - # Flatten metadata: - for sample in input_json["metadata"]["samples"]: - input_json["metadata"]["samples"][sample] = flatten_dictionary(input_json["metadata"]["samples"][sample]) - - json_data = json.dumps(input_json, sort_keys=True, indent=4) - with open(json_output_location, "w") as output_file: - output_file.write(json_data) - - print("Output written to " + str(json_output_location) + "!") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/iridanext.config 
b/conf/iridanext.config new file mode 100644 index 0000000..ce9ad72 --- /dev/null +++ b/conf/iridanext.config @@ -0,0 +1,18 @@ +iridanext { + enabled = true + output { + path = "${params.outdir}/iridanext.output.json.gz" + overwrite = true + files { + samples = ["**/input/*_error_report.csv"] + } + metadata { + samples { + csv { + path = "**/filter/new_addresses.csv" + idcol = "id" + } + } + } + } +} diff --git a/conf/modules.config b/conf/modules.config index 08fc284..00855c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,10 +13,11 @@ process { // Publish directory names - assembly_directory_name = "assembly" - summary_directory_name = "summary" + profile_dists_directory_name = "distances" + gas_call_directory_name = "call" - locidex_merge_directory_name = [params.outdir , "locidex", "merge"].join(File.separator) + locidex_merge_ref_directory_name = [params.outdir , "locidex", "merge", "reference"].join(File.separator) + locidex_merge_query_directory_name = [params.outdir , "locidex", "merge", "query"].join(File.separator) publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -24,44 +25,54 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: INPUT_ASSURE { + fair = true } - withName: ASSEMBLY_STUB { + withName: LOCIDEX_MERGE_REF { publishDir = [ - path: { ["${params.outdir}", "${task.assembly_directory_name}"].join(File.separator) }, + path: locidex_merge_ref_directory_name, mode: params.publish_dir_mode, + pattern: "*/*", saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: GENERATE_SUMMARY { + withName: LOCIDEX_MERGE_QUERY { publishDir = [ - path: { ["${params.outdir}", "${task.summary_directory_name}"].join(File.separator) }, + path: locidex_merge_query_directory_name, mode: params.publish_dir_mode, + pattern: "*/*", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: IRIDA_NEXT_OUTPUT { + withName: PROFILE_DISTS { publishDir = [ - path: { "${params.outdir}" }, + path: { ["${params.outdir}", "${task.profile_dists_directory_name}"].join(File.separator) }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : + filename.contains(File.separator) ? filename.split(File.separator)[-1] : filename } ] } - withName: LOCIDEX_MERGE { + withName: GAS_CALL { publishDir = [ - path: locidex_merge_directory_name, - mode: params.publish_dir_mode, - pattern: "*/*", - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/thresholds.json" + ], + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/results.{text,parquet}" + ], + [ + path: { ["${params.outdir}", "${task.gas_call_directory_name}"].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*/run.json" + ] ] } diff --git a/conf/test.config b/conf/test.config index 0e0b591..dee168d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,13 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/assets/samplesheet.csv' + input = "${projectDir}/assets/samplesheet.csv" } + + +/* This is required to run in WSL/Ubuntu using singularity +Without this, profile_dists was not successfully completing +due to issues with multiprocessing in the container. A similar +error is found at https://github.com/marcelm/cutadapt/issues/583 +*/ +singularity.runOptions = "--contain" diff --git a/conf/test_full.config b/conf/test_full.config index c8b5764..4133c10 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,14 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/assets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/samplesheets/samplesheet1.csv' + ref_clusters = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/clusters/expected_clusters.txt' } + +/* This is required to run in WSL/Ubuntu using singularity +Without this, profile_dists was not successfully completing +due to issues with multiprocessing in the container. 
A similar +error is found at https://github.com/marcelm/cutadapt/issues/583 +*/ +singularity.runOptions = "--contain" + diff --git a/docs/output.md b/docs/output.md index 817c382..27a33c2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,11 +6,13 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -- assembly: very small mock assembly files for each sample -- generate: intermediate files used in generating the IRIDA Next JSON output -- pipeline_info: information about the pipeline's execution -- simplify: simplified intermediate files used in generating the IRIDA Next JSON output -- summary: summary report about the pipeline's execution and results +- call: The cluster addresses from the [genomic_address_service](https://github.com/phac-nml/genomic_address_service). +- cluster: The cluster file required by GAS_call. +- distances: Distances between genomes from [profile_dists](https://github.com/phac-nml/profile_dists). +- filter: The cluster addresses from only the query samples. +- input: An error report that is only generated when sample IDs and MLST JSON files do not match. +- locidex: The merged MLST JSON files for reference and query samples. +- pipeline_info: Information about the pipeline's execution The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.gz` and will be written to the top-level of the results directory. This file is compressed using GZIP and conforms to the [IRIDA Next JSON output specifications](https://github.com/phac-nml/pipeline-standards#42-irida-next-json). 
@@ -18,60 +20,80 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Assembly stub](#assembly-stub) - Performs a stub assembly by generating a mock assembly -- [Generate sample JSON](#generate-sample-json) - Generates a JSON file for each sample -- [Generate summary](#generate-summary) - Generates a summary text file describing the samples and assemblies -- [Simplify IRIDA JSON](#simplify-irida-json) - Simplifies the sample JSONs by limiting nesting depth +- [Input assure](#input-assure) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found. +- [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples. +- [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences. +- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call. +- [GAS call](#gas-call) - Generates hierarchical cluster addresses. +- [Filter query](#filter-query) - Filters and generates a csv file containing only the cluster addresses for query samples. - [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### Assembly stub +### Input Assure
Output files -- `assembly/` - - Mock assembly files: `ID.assembly.fa.gz` +- `input/` + - `sampleID_error_report.csv` + - `sampleID.mlst.json.gz`
-### Generate sample JSON +### Locidex merge
Output files -- `generate/` - - JSON files: `ID.json.gz` +- `locidex/merge/` + - reference samples: `reference/merged_ref/merged_profiles_ref.tsv` + - query samples: `query/merged_value/merged_profile_value.tsv`
-### Generate summary +### Profile Dists
Output files -- `summary/` - - Text summary describing samples and assemblies: `summary.txt.gz` +- `distances/` + - Mapping allele identifiers to integers: `allele_map.json` + - The query MLST profiles: `query_profile.text` + - The reference MLST profiles: `ref_profile.text` + - The computed distances based on MLST allele differences: `results.text` + - Information on the profile_dists run: `run.json`
-### Simplify IRIDA JSON +### Cluster File
Output files -- `simplify/` - - Simplified JSON files: `ID.simple.json.gz` +- `cluster/` + - `reference_clusters.txt`
-### IRIDA Next Output +### GAS call
Output files -- `/` - - IRIDA Next-compliant JSON output: `iridanext.output.json.gz` +- `call/` + - The computed cluster addresses: `clusters.text` + - Information on the GAS call run: `run.json` + - Thresholds used to compute cluster addresses: `thresholds.json` + +
+ +### Filter Query + +
+Output files + +- `filter/` + - `new_addresses.csv`
diff --git a/docs/usage.md b/docs/usage.md index 4fbd758..2433443 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,7 +2,7 @@ ## Introduction -This pipeline is an example that illustrates running a nf-core-compliant pipeline on IRIDA Next. +This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline. ## Samplesheet input @@ -14,22 +14,22 @@ You will need to create a samplesheet with information about the samples you wou ### Full samplesheet -The input samplesheet must contain three columns: `ID`, `fastq_1`, `fastq_2`. The IDs within a samplesheet should be unique. All other columns will be ignored. +The input samplesheet must contain three columns: `sample`, `mlst_alleles`, `address`. The sample names within a samplesheet should be unique. All other columns will be ignored. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. +A final samplesheet file consisting of mlst_alleles and addresses may look something like the one below: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -SAMPLE1,sample1_R1.fastq.gz,sample1_R2.fastq.gz -SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz -SAMPLE3,sample1_R1.fastq.gz, +sample,mlst_alleles,address +sampleA,sampleA.mlst.json.gz,1.1.1 +sampleQ,sampleQ.mlst.json.gz,2.2.2 +sampleF,sampleF.mlst.json, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. Samples should be unique within a samplesheet. | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2.
File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Samples should be unique within a samplesheet. | +| `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). | +| `address` | Hierarchal clustering address. If left empty for a sample, the pipeline will assign a cluster address. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
diff --git a/modules/local/assemblystub/main.nf b/modules/local/assemblystub/main.nf deleted file mode 100644 index 00f27d2..0000000 --- a/modules/local/assemblystub/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process ASSEMBLY_STUB { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.assembly.fa.gz"), emit: assembly - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - cat <<-EOF > ${prefix}.assembly.fa - >${meta.id}-stub-assembly - ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT - EOF - - gzip -n ${prefix}.assembly.fa - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - assemblystub : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/cluster_file/main.nf b/modules/local/cluster_file/main.nf new file mode 100644 index 0000000..0a97545 --- /dev/null +++ b/modules/local/cluster_file/main.nf @@ -0,0 +1,45 @@ +process CLUSTER_FILE { + tag "Create cluster file for GAS call" + label 'process_single' + + input: + val meta + + output: + path("reference_clusters.txt"), emit: text + + exec: + def outputLines = [] + def delimiter = java.util.regex.Pattern.quote(params.gm_delimiter) + + // Determine the maximum number of levels to set the header requirements for each pipeline run + int maxLevels = meta.collect { sample -> sample.address.split(delimiter).size() }.max() ?: 0 + + // Verify each sample is consistent with $maxLevels + meta.each { sample -> + int level = sample.address.split(delimiter).size() + if (level != maxLevels) { + error ("Inconsistent levels found: expected $maxLevels levels but found $level levels in ${sample.id}") + } + } + + // Generate the header for the expected_clusters.txt file + def header = ["id", "address"] + (1..maxLevels).collect { "level_$it" } + outputLines << 
header.join("\t") + + // Iterate over each sample in the meta list and pull the relevant information for the text file + meta.each { sample -> + def id = sample.id + def address = sample.address + def levels = address.split(delimiter) + def line = [id, address] + levels.collect { it.toString() } + outputLines << line.join("\t") + } + + // Write the text file, iterating over each sample + task.workDir.resolve("reference_clusters.txt").withWriter { writer -> + outputLines.each { line -> + writer.writeLine(line) + } + } +} diff --git a/modules/local/filter_query/main.nf b/modules/local/filter_query/main.nf new file mode 100644 index 0000000..9912ee5 --- /dev/null +++ b/modules/local/filter_query/main.nf @@ -0,0 +1,45 @@ +process FILTER_QUERY { + tag "Filter New Query Addresses" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' : + 'biocontainers/csvtk:0.22.0--h9ee0642_1' }" + + input: + val query_ids + path addresses + val in_format + val out_format + + output: + path("new_addresses.*"), emit: csv + path("versions.yml"), emit: versions + + script: + def outputFile = "new_addresses" + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + def out_extension = out_format == "tsv" ? 
'tsv' : 'csv' + + // Join the query IDs in the correct csvtk filter2 required format + def queryID = query_ids.collect { id -> "\$id == \"${id}\"" }.join(" || ") + + """ + # Filter the query samples only; keep only the 'id' and 'address' columns + csvtk filter2 \\ + ${addresses} \\ + --filter '$queryID' \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" | \\ + csvtk cut -f id,address > ${outputFile}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + +} + diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf index 1fe2c64..3216c9e 100644 --- a/modules/local/gas/call/main.nf +++ b/modules/local/gas/call/main.nf @@ -2,7 +2,7 @@ process GAS_CALL{ label "process_high" - tag "Calling: ${meta.id}" + tag "Assigning Nomenclature" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/genomic_address_service%3A0.1.1--pyh7cba7a3_1' : @@ -10,21 +10,22 @@ process GAS_CALL{ input: - tuple val(meta), path(reference_clusters), path(distances) + path(reference_clusters) + path(distances) output: - tuple val(meta), path("${prefix}/results.{text,parquet}"), emit: distances, optional: true - tuple val(meta), path("${prefix}/thresholds.json"), emit: thresholds - tuple val(meta), path("${prefix}/run.json"), emit: run + path("${prefix}/results.{text,parquet}"), emit: distances, optional: true + path("${prefix}/thresholds.json"), emit: thresholds + path("${prefix}/run.json"), emit: run path "versions.yml", emit: versions script: - // Need to add more args for gas call below - prefix = meta.id + prefix = "Called" """ gas call --dists $distances \\ --rclusters $reference_clusters \\ --outdir ${prefix} \\ + --method ${params.gm_method} \\ --threshold ${params.gm_thresholds} \\ --delimeter ${params.gm_delimiter} diff --git 
a/modules/local/generatesamplejson/main.nf b/modules/local/generatesamplejson/main.nf deleted file mode 100644 index f3b5cd3..0000000 --- a/modules/local/generatesamplejson/main.nf +++ /dev/null @@ -1,49 +0,0 @@ -process GENERATE_SAMPLE_JSON { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads), path(assembly) - - output: - tuple val(meta), path("*.json.gz"), emit: json - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def assembly_path = ["${task.assembly_directory_name}", "${assembly}"].join(File.separator) - """ - cat <<-EOF > "${meta.id}.json" - { - "files": { - "samples": { - "${meta.id}": [ - { - "path": "${assembly_path}" - } - ] - } - }, - "metadata": { - "samples": { - "${meta.id}": { - "reads": ["${reads[0]}", "${reads[1]}"] - } - } - } - } - EOF - gzip ${meta.id}.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - generatesamplejson : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/generatesummary/main.nf b/modules/local/generatesummary/main.nf deleted file mode 100644 index a3d0245..0000000 --- a/modules/local/generatesummary/main.nf +++ /dev/null @@ -1,38 +0,0 @@ -process GENERATE_SUMMARY { - label 'process_single' - container 'docker.io/python:3.9.17' - - input: - val summaries - - output: - path("summary.txt.gz"), emit: summary - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def sorted_summaries = summaries.sort{ it[0].id } - - // Generate summary text: - def summary_text = "IRIDANEXTEXAMPLE Pipeline Summary\n\nSUCCESS!\n" - - // TODO: Consider the possibility of code injection. - // Should probably be moved to file processing through Python. 
- for (summary in sorted_summaries) { - summary_text += "\n${summary[0].id}:\n" - summary_text += " reads.1: ${summary[1][0]}\n" - summary_text += " reads.2: ${summary[1][1]}\n" - summary_text += " assembly: ${summary[2]}\n" - } - - version_text = "\"${task.process}\":\n generatesummary : 0.1.0" - - """ - echo "${summary_text}" > summary.txt - gzip -n summary.txt - echo "${version_text}" > versions.yml - """ -} diff --git a/modules/local/input_assure/main.nf b/modules/local/input_assure/main.nf new file mode 100644 index 0000000..43b7462 --- /dev/null +++ b/modules/local/input_assure/main.nf @@ -0,0 +1,32 @@ +process INPUT_ASSURE { + tag "Assures Inputs are Consistent" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(mlst) + + output: + tuple val(meta), path("${meta.id}.mlst.json.gz"), emit: result + tuple val(meta), path("*_error_report.csv"), optional: true, emit: error_report + path("versions.yml"), emit: versions + + script: + + """ + input_assure.py \\ + --input ${mlst} \\ + --sample_id ${meta.id} \\ + --address ${meta.address} \\ + --output_error ${meta.id}_error_report.csv \\ + --output_json ${meta.id}.mlst.json.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/iridanextoutput/main.nf b/modules/local/iridanextoutput/main.nf deleted file mode 100644 index 92595ee..0000000 --- a/modules/local/iridanextoutput/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process IRIDA_NEXT_OUTPUT { - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - path(samples_data) - - output: - path("iridanext.output.json.gz"), emit: output_json - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: 
- def args = task.ext.args ?: '' - def samples_data_dir = "samples_data" - """ - irida-next-output.py \\ - $args \\ - --summary-file ${task.summary_directory_name}/summary.txt.gz \\ - --json-output iridanext.output.json.gz \\ - ${samples_data} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - iridanextoutput : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/locidex/merge/main.nf b/modules/local/locidex/merge/main.nf index bd9f3e8..7721625 100644 --- a/modules/local/locidex/merge/main.nf +++ b/modules/local/locidex/merge/main.nf @@ -9,17 +9,20 @@ process LOCIDEX_MERGE { 'quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_0' }" input: - val input_values // [file(sample1), file(sample2), file(sample3), etc...] + path input_values // [file(sample1), file(sample2), file(sample3), etc...] + val input_tag // makes output unique and denotes the item as the reference or query to preven name collision output: path("${combined_dir}/*.tsv"), emit: combined_profiles - path("${combined_dir}/*.json"), emit: report path "versions.yml", emit: versions script: - combined_dir = "merged" + combined_dir = "merged_${input_tag}" """ locidex merge -i ${input_values.join(' ')} -o ${combined_dir} + + mv ${combined_dir}/*.tsv ${combined_dir}/merged_profiles_${input_tag}.tsv + cat <<-END_VERSIONS > versions.yml "${task.process}": locidex merge: \$(echo \$(locidex search -V 2>&1) | sed 's/^.*locidex //' ) diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf index 2e48a02..f43d63b 100644 --- a/modules/local/profile_dists/main.nf +++ b/modules/local/profile_dists/main.nf @@ -1,24 +1,24 @@ process PROFILE_DISTS{ label "process_high" - tag "Pairwise Distance Generation: ${meta.id}" + tag "Gathering Distances Between Reference and Query Profiles" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/profile_dists%3A1.0.0--pyh7cba7a3_0' : 'quay.io/biocontainers/profile_dists:1.0.0--pyh7cba7a3_0' }" input: - tuple val(meta), path(query), path(ref) - val mapping_format - path(mapping_file) - path(columns) + path query + path ref + path mapping_file + path columns output: - tuple val(meta), path("${prefix}_${mapping_format}/allele_map.json"), emit: allele_map - tuple val(meta), path("${prefix}_${mapping_format}/query_profile.{text,parquet}"), emit: query_profile - tuple val(meta), path("${prefix}_${mapping_format}/ref_profile.{text,parquet}"), emit: ref_profile - tuple val(meta), path("${prefix}_${mapping_format}/results.{text,parquet}"), emit: results - tuple val(meta), path("${prefix}_${mapping_format}/run.json"), emit: run + path("${prefix}/allele_map.json"), emit: allele_map + path("${prefix}/query_profile.{text,parquet}"), emit: query_profile + path("${prefix}/ref_profile.{text,parquet}"), emit: ref_profile + path("${prefix}/results.{text,parquet}"), emit: results + path("${prefix}/run.json"), emit: run path "versions.yml", emit: versions @@ -31,9 +31,6 @@ process PROFILE_DISTS{ if(columns){ args = args + " --columns $columns" } - if(params.pd_force){ - args = args + " --force" - } if(params.pd_skip){ args = args + " --skip" } @@ -41,16 +38,17 @@ process PROFILE_DISTS{ args = args + " --count_missing" } // --match_threshold $params.profile_dists.match_thresh \\ - prefix = meta.id + prefix = "distances_pairwise" """ - profile_dists --query $query --ref $ref $args --outfmt $mapping_format \\ + profile_dists --query $query --ref $ref $args \\ + --outfmt pairwise \\ --distm $params.pd_distm \\ --file_type $params.pd_file_type \\ --missing_thresh $params.pd_missing_threshold \\ --sample_qual_thresh $params.pd_sample_quality_threshold \\ --max_mem ${task.memory.toGiga()} \\ --cpus ${task.cpus} \\ - -o ${prefix}_${mapping_format} + -o ${prefix} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git 
a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 6c1c1f4..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in phac-nml/gasnomenclature/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/simplifyiridajson/main.nf b/modules/local/simplifyiridajson/main.nf deleted file mode 100644 index e2e7352..0000000 --- a/modules/local/simplifyiridajson/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process SIMPLIFY_IRIDA_JSON { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(json) - - output: - tuple val(meta), path("*.simple.json.gz") , emit: simple_json - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - simplify_irida_json.py \\ - $args \\ - --json-output ${meta.id}.simple.json \\ - ${json} - - gzip ${meta.id}.simple.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - simplifyiridajson : 0.1.0 - END_VERSIONS - """ -} diff --git a/nextflow.config b/nextflow.config index bddc99e..423d59c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,9 +11,6 @@ params { // Input 
options input = null - project_name = 'assembly' - assembler = 'stub' - random_seed = 1 // Boilerplate options outdir = null @@ -47,6 +44,20 @@ params { validate_params = true // Profile Dists + pd_distm = "hamming" + pd_missing_threshold = 1.0 + pd_sample_quality_threshold = 1.0 + pd_file_type = "text" + pd_mapping_file = null // default is no file + pd_skip = false + pd_columns = null + pd_count_missing = false + + + // GAS Call + gm_thresholds = "10,5,0" + gm_method = "average" + gm_delimiter = "." } @@ -160,8 +171,12 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-iridanext@0.2.0' // Generation of JSON output for IRIDA Next } +// Load iridanext.config for specific options +includeConfig 'conf/iridanext.config' + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. 
@@ -204,7 +219,7 @@ manifest { description = """Gas Nomenclature assignment pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = 'v0.0.1dev' + version = '0.1.0' doi = '' defaultBranch = 'main' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 8639c86..b2b8a89 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,9 +2,93 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/nextflow_schema.json", "title": "phac-nml/gasnomenclature pipeline parameters", - "description": "IRIDA Next Example Pipeline", + "description": "Gas Nomenclature assignment pipeline", "type": "object", "definitions": { + "gas_call": { + "title": "GAS Call", + "type": "object", + "description": "", + "default": "", + "properties": { + "gm_thresholds": { + "type": "string", + "default": "10,5,0", + "description": "Thresholds delimited by ','. Values should match units from '--pd_distm' (either 'hamming' or 'scaled').", + "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$" + }, + "gm_method": { + "type": "string", + "default": "average", + "description": "Clustering linkage method.", + "enum": ["single", "average", "complete"] + }, + "gm_delimiter": { + "type": "string", + "default": ".", + "description": "Delimiter desired for nomenclature code.", + "pattern": "^[A-Fa-f0-9\\._-]+$" + } + } + }, + "profile_dists": { + "title": "Profile Dists", + "type": "object", + "description": "", + "default": "", + "properties": { + "pd_distm": { + "type": "string", + "description": "The distance method/unit", + "enum": ["hamming", "scaled"], + "default": "hamming" + }, + "pd_missing_threshold": { + "type": "number", + "description": "The maximum proportion of missing data per locus for a locus to be kept in the analysis", + "minimum": 0, + "maximum": 1, + "default": 1 + }, + "pd_sample_quality_threshold": { + "type": "number", + "description": "The maximum proportion of missing data per 
sample for a sample to be kept in the analysis", + "minimum": 0, + "maximum": 1, + "default": 1 + }, + "pd_file_type": { + "type": "string", + "description": "Output format file type", + "enum": ["text", "parquet"], + "default": "text" + }, + "pd_mapping_file": { + "type": "string", + "pattern": "^\\S+\\.json(\\.gz)?$", + "description": "A file used to map allele codes to integers for internal distance calculations", + "exists": true, + "hidden": true, + "format": "file-path" + }, + "pd_skip": { + "type": "boolean", + "description": "Skip QA/QC steps" + }, + "pd_columns": { + "type": "string", + "pattern": "^\\S+$", + "description": "Defines the loci to keep within the analysis. Formatted as a single column file with one locus name per line or list of comma-separated loci", + "exists": true, + "format": "file-path" + }, + "pd_count_missing": { + "type": "boolean", + "description": "Count missing alleles as different", + "default": false + } + } + }, "input_output_options": { "title": "Input/output options", "type": "object", @@ -29,27 +113,6 @@ "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, - "project_name": { - "type": "string", - "default": "assembly", - "pattern": "^\\S+$", - "description": "The name of the project.", - "fa_icon": "fas fa-tag" - }, - "assembler": { - "type": "string", - "default": "stub", - "fa_icon": "fas fa-desktop", - "description": "The sequence assembler to use for sequence assembly.", - "enum": ["default", "stub", "experimental"] - }, - "random_seed": { - "type": "integer", - "default": 1, - "fa_icon": "fas fa-dice-six", - "description": "The random seed to use for sequence assembly.", - "minimum": 1 - }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -214,6 +277,12 @@ } }, "allOf": [ + { + "$ref": "#/definitions/gas_call" + }, + { + "$ref": "#/definitions/profile_dists" + }, { "$ref": "#/definitions/input_output_options" }, diff --git a/nf-test.config b/nf-test.config index 870799d..2fa82ad 100644 --- a/nf-test.config +++ b/nf-test.config @@ -3,6 +3,6 @@ config { testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" - profile "" + profile "docker" } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index 0aecf87..0000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,44 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def 
meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} diff --git a/tests/data/called/expected_results.txt b/tests/data/called/expected_results.txt new file mode 100644 index 0000000..1e530e7 --- /dev/null +++ b/tests/data/called/expected_results.txt @@ -0,0 +1,5 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 1.1.3 1 1 3 diff --git a/tests/data/called/expected_results_count-missing.txt b/tests/data/called/expected_results_count-missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_count-missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/called/expected_results_loci-missing.txt b/tests/data/called/expected_results_loci-missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_loci-missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/called/expected_results_missing.txt b/tests/data/called/expected_results_missing.txt new file mode 100644 index 0000000..26b264c --- /dev/null +++ b/tests/data/called/expected_results_missing.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 +sampleQ 1 1 diff --git a/tests/data/called/expected_results_queries.txt 
b/tests/data/called/expected_results_queries.txt new file mode 100644 index 0000000..f5e5ae4 --- /dev/null +++ b/tests/data/called/expected_results_queries.txt @@ -0,0 +1,6 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 2.2.3 2 2 3 +sampleN 2.2.3 2 2 3 diff --git a/tests/data/called/expected_results_scaled.txt b/tests/data/called/expected_results_scaled.txt new file mode 100644 index 0000000..bd70a8e --- /dev/null +++ b/tests/data/called/expected_results_scaled.txt @@ -0,0 +1,5 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 +sampleQ 1.2.3 1 2 3 diff --git a/tests/data/called/expected_results_thresh_1.txt b/tests/data/called/expected_results_thresh_1.txt new file mode 100644 index 0000000..165001a --- /dev/null +++ b/tests/data/called/expected_results_thresh_1.txt @@ -0,0 +1,5 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 1 1 +sampleQ 1 1 diff --git a/tests/data/called/expected_results_thresh_1_0.txt b/tests/data/called/expected_results_thresh_1_0.txt new file mode 100644 index 0000000..c2ddc4f --- /dev/null +++ b/tests/data/called/expected_results_thresh_1_0.txt @@ -0,0 +1,5 @@ +id address level_1 level_2 +sample1 1.1 1 1 +sample2 1.1 1 1 +sample3 1.1 1 1 +sampleQ 1.2 1 2 diff --git a/tests/data/clusters/expected_clusters.txt b/tests/data/clusters/expected_clusters.txt new file mode 100644 index 0000000..362ea84 --- /dev/null +++ b/tests/data/clusters/expected_clusters.txt @@ -0,0 +1,4 @@ +id address level_1 level_2 level_3 +sample1 1.1.1 1 1 1 +sample2 1.1.1 1 1 1 +sample3 1.1.2 1 1 2 diff --git a/tests/data/clusters/expected_clusters_missing.txt b/tests/data/clusters/expected_clusters_missing.txt new file mode 100644 index 0000000..186ff1d --- /dev/null +++ b/tests/data/clusters/expected_clusters_missing.txt @@ -0,0 +1,4 @@ +id address level_1 +sample1 1 1 +sample2 1 1 +sample3 2 2 diff --git a/tests/data/clusters/expected_tree.nwk 
b/tests/data/clusters/expected_tree.nwk new file mode 100644 index 0000000..a8cc370 --- /dev/null +++ b/tests/data/clusters/expected_tree.nwk @@ -0,0 +1 @@ +((sample2:0.000000,sample1:0.000000):16.666666666666668,sample3:33.333333); diff --git a/tests/data/columns/keep-zero-loci-empty-file.txt b/tests/data/columns/keep-zero-loci-empty-file.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/data/columns/keep-zero-loci-empty-file.txt @@ -0,0 +1 @@ + diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv new file mode 100644 index 0000000..45bdd70 --- /dev/null +++ b/tests/data/distances/expected_dists.tsv @@ -0,0 +1,5 @@ +dists sampleQ sample1 sample2 sample3 +sampleQ 0 1 1 2 +sample1 1 0 0 1 +sample2 1 0 0 1 +sample3 2 1 1 0 diff --git a/tests/data/distances/expected_dists_count-missing.txt b/tests/data/distances/expected_dists_count-missing.txt new file mode 100644 index 0000000..1313023 --- /dev/null +++ b/tests/data/distances/expected_dists_count-missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 2 +sampleQ sample3 3 diff --git a/tests/data/distances/expected_dists_loci-missing.txt b/tests/data/distances/expected_dists_loci-missing.txt new file mode 100644 index 0000000..1313023 --- /dev/null +++ b/tests/data/distances/expected_dists_loci-missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 2 +sampleQ sample3 3 diff --git a/tests/data/distances/expected_dists_missing.txt b/tests/data/distances/expected_dists_missing.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_missing.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/distances/expected_dists_scaled.txt b/tests/data/distances/expected_dists_scaled.txt new file mode 100644 index 0000000..cd51991 --- 
/dev/null +++ b/tests/data/distances/expected_dists_scaled.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0.0 +sampleQ sample1 33.333333333333336 +sampleQ sample2 33.333333333333336 +sampleQ sample3 66.66666666666667 diff --git a/tests/data/distances/expected_dists_thresh_1.txt b/tests/data/distances/expected_dists_thresh_1.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_thresh_1.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/distances/expected_dists_thresh_1_0.txt b/tests/data/distances/expected_dists_thresh_1_0.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_dists_thresh_1_0.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/distances/expected_pairwise_dists.txt b/tests/data/distances/expected_pairwise_dists.txt new file mode 100644 index 0000000..84ea004 --- /dev/null +++ b/tests/data/distances/expected_pairwise_dists.txt @@ -0,0 +1,5 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 diff --git a/tests/data/distances/expected_pairwise_queries_dists.txt b/tests/data/distances/expected_pairwise_queries_dists.txt new file mode 100644 index 0000000..44aa848 --- /dev/null +++ b/tests/data/distances/expected_pairwise_queries_dists.txt @@ -0,0 +1,11 @@ +query_id ref_id dist +sampleQ sampleQ 0 +sampleQ sampleN 0 +sampleQ sample1 1 +sampleQ sample2 1 +sampleQ sample3 2 +sampleN sampleQ 0 +sampleN sampleN 0 +sampleN sample1 1 +sampleN sample2 1 +sampleN sample3 2 diff --git a/tests/data/irida/count-missing_iridanext.output.json b/tests/data/irida/count-missing_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/count-missing_iridanext.output.json @@ -0,0 +1,13 @@ +{ + 
"files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/irida/loci-missing_iridanext.output.json b/tests/data/irida/loci-missing_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/loci-missing_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/irida/mismatched_iridanext.output.json b/tests/data/irida/mismatched_iridanext.output.json new file mode 100644 index 0000000..750523b --- /dev/null +++ b/tests/data/irida/mismatched_iridanext.output.json @@ -0,0 +1,27 @@ +{ + "files": { + "global": [], + "samples": { + "sampleR": [ + { + "path": "input/sampleR_error_report.csv" + } + ], + "sample2": [ + { + "path": "input/sample2_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleR": { + "address": "2.2.3" + } + } + } +} diff --git a/tests/data/irida/missing_iridanext.output.json b/tests/data/irida/missing_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/missing_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/irida/multiplekeys_iridanext.output.json b/tests/data/irida/multiplekeys_iridanext.output.json new file mode 100644 index 0000000..f7b872f --- /dev/null +++ b/tests/data/irida/multiplekeys_iridanext.output.json @@ -0,0 +1,19 @@ +{ + "files": { + "global": [], + "samples": { + "sample3": [ + { + "path": "input/sample3_error_report.csv" + } + ] + } + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/data/irida/queries_iridanext.output.json 
b/tests/data/irida/queries_iridanext.output.json new file mode 100644 index 0000000..7063e8e --- /dev/null +++ b/tests/data/irida/queries_iridanext.output.json @@ -0,0 +1,16 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "2.2.3" + }, + "sampleN": { + "address": "2.2.3" + } + } + } +} diff --git a/tests/data/irida/scaled_iridanext.output.json b/tests/data/irida/scaled_iridanext.output.json new file mode 100644 index 0000000..3121e6a --- /dev/null +++ b/tests/data/irida/scaled_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.2.3" + } + } + } +} diff --git a/tests/data/irida/test_iridanext.output.json b/tests/data/irida/test_iridanext.output.json new file mode 100644 index 0000000..3d0bfb5 --- /dev/null +++ b/tests/data/irida/test_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.1.3" + } + } + } +} diff --git a/tests/data/irida/thresh1.0_iridanext.output.json b/tests/data/irida/thresh1.0_iridanext.output.json new file mode 100644 index 0000000..d85169a --- /dev/null +++ b/tests/data/irida/thresh1.0_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1.2" + } + } + } +} diff --git a/tests/data/irida/thresh1_iridanext.output.json b/tests/data/irida/thresh1_iridanext.output.json new file mode 100644 index 0000000..2f0745e --- /dev/null +++ b/tests/data/irida/thresh1_iridanext.output.json @@ -0,0 +1,13 @@ +{ + "files": { + "global": [], + "samples": {} + }, + "metadata": { + "samples": { + "sampleQ": { + "address": "1" + } + } + } +} diff --git a/tests/data/profiles/expected-profile1.tsv b/tests/data/profiles/expected-profile1.tsv new file mode 100644 index 0000000..6d02526 --- /dev/null +++ 
b/tests/data/profiles/expected-profile1.tsv @@ -0,0 +1,5 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/profiles/expected-profile2.tsv b/tests/data/profiles/expected-profile2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile2.tsv @@ -0,0 +1,2 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 diff --git a/tests/data/profiles/expected-profile_missing1.tsv b/tests/data/profiles/expected-profile_missing1.tsv new file mode 100644 index 0000000..6d37496 --- /dev/null +++ b/tests/data/profiles/expected-profile_missing1.tsv @@ -0,0 +1,5 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sample1 1 1 1 +sample2 - 1 1 +sample3 - 1 2 diff --git a/tests/data/profiles/expected-profile_missing2.tsv b/tests/data/profiles/expected-profile_missing2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile_missing2.tsv @@ -0,0 +1,2 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 diff --git a/tests/data/profiles/expected-profile_queries1.tsv b/tests/data/profiles/expected-profile_queries1.tsv new file mode 100644 index 0000000..b2f8100 --- /dev/null +++ b/tests/data/profiles/expected-profile_queries1.tsv @@ -0,0 +1,6 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git a/tests/data/profiles/expected-profile_queries2.tsv b/tests/data/profiles/expected-profile_queries2.tsv new file mode 100644 index 0000000..4b4d059 --- /dev/null +++ b/tests/data/profiles/expected-profile_queries2.tsv @@ -0,0 +1,3 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sampleN 1 2 1 diff --git a/tests/data/profiles/expected-profile_scaled1.tsv b/tests/data/profiles/expected-profile_scaled1.tsv new file mode 100644 index 0000000..6d02526 --- /dev/null +++ b/tests/data/profiles/expected-profile_scaled1.tsv @@ -0,0 +1,5 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 +sample1 1 1 1 +sample2 1 1 1 +sample3 1 1 2 diff --git 
a/tests/data/profiles/expected-profile_scaled2.tsv b/tests/data/profiles/expected-profile_scaled2.tsv new file mode 100644 index 0000000..44020cb --- /dev/null +++ b/tests/data/profiles/expected-profile_scaled2.tsv @@ -0,0 +1,2 @@ +sample_id l1 l2 l3 +sampleQ 1 2 1 diff --git a/tests/data/reports/sample1.mlst.json b/tests/data/reports/sample1.mlst.json new file mode 100644 index 0000000..01bc774 --- /dev/null +++ b/tests/data/reports/sample1.mlst.json @@ -0,0 +1,7 @@ +{ + "sample1": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample1.mlst.json.gz b/tests/data/reports/sample1.mlst.json.gz new file mode 100644 index 0000000..735e108 Binary files /dev/null and b/tests/data/reports/sample1.mlst.json.gz differ diff --git a/tests/data/reports/sample2.mlst.json b/tests/data/reports/sample2.mlst.json new file mode 100644 index 0000000..7c0426c --- /dev/null +++ b/tests/data/reports/sample2.mlst.json @@ -0,0 +1,7 @@ +{ + "sample2": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample2_empty.mlst.json b/tests/data/reports/sample2_empty.mlst.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/data/reports/sample2_empty.mlst.json @@ -0,0 +1 @@ +{} diff --git a/tests/data/reports/sample2_missing.mlst.json b/tests/data/reports/sample2_missing.mlst.json new file mode 100644 index 0000000..113e15b --- /dev/null +++ b/tests/data/reports/sample2_missing.mlst.json @@ -0,0 +1,7 @@ +{ + "sample2": { + "l1": "-", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sample3.mlst.json b/tests/data/reports/sample3.mlst.json new file mode 100644 index 0000000..43ea3c7 --- /dev/null +++ b/tests/data/reports/sample3.mlst.json @@ -0,0 +1,7 @@ +{ + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_missing.mlst.json b/tests/data/reports/sample3_missing.mlst.json new file mode 100644 index 0000000..49942f8 --- /dev/null +++ 
b/tests/data/reports/sample3_missing.mlst.json @@ -0,0 +1,7 @@ +{ + "sample3": { + "l1": "-", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_multiplekeys.mlst.json b/tests/data/reports/sample3_multiplekeys.mlst.json new file mode 100644 index 0000000..5d85e65 --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys.mlst.json @@ -0,0 +1,12 @@ +{ + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "sample3": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json new file mode 100644 index 0000000..6d7878d --- /dev/null +++ b/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json @@ -0,0 +1,12 @@ +{ + "sample4": { + "l1": "1", + "l2": "1", + "l3": "2" + }, + "extra_key": { + "l1": "1", + "l2": "1", + "l3": "2" + } +} diff --git a/tests/data/reports/sample7.mlst.json b/tests/data/reports/sample7.mlst.json new file mode 100644 index 0000000..41d6312 --- /dev/null +++ b/tests/data/reports/sample7.mlst.json @@ -0,0 +1,7 @@ +{ + "sample7": { + "l1": "1", + "l2": "1", + "l3": "1" + } +} diff --git a/tests/data/reports/sampleF.mlst.json b/tests/data/reports/sampleF.mlst.json new file mode 100644 index 0000000..8c09d39 --- /dev/null +++ b/tests/data/reports/sampleF.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleF": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git a/tests/data/reports/sampleN.mlst.json b/tests/data/reports/sampleN.mlst.json new file mode 100644 index 0000000..178b6db --- /dev/null +++ b/tests/data/reports/sampleN.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleN": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git a/tests/data/reports/sampleQ.mlst.json b/tests/data/reports/sampleQ.mlst.json new file mode 100644 index 0000000..c6cca43 --- /dev/null +++ b/tests/data/reports/sampleQ.mlst.json @@ -0,0 +1,7 @@ +{ + "sampleQ": { + "l1": "1", + "l2": "2", + "l3": "1" + } +} diff --git 
a/tests/data/samplesheets/samplesheet-hash_missing.csv b/tests/data/samplesheets/samplesheet-hash_missing.csv new file mode 100644 index 0000000..7bfe7af --- /dev/null +++ b/tests/data/samplesheets/samplesheet-hash_missing.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2_missing.mlst.json,1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_missing.mlst.json,2 diff --git a/tests/data/samplesheets/samplesheet-mismatched_IDs.csv b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv new file mode 100644 index 0000000..73230d4 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-mismatched_IDs.csv @@ -0,0 +1,7 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleR,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleF.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample7.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/data/samplesheets/samplesheet-multiple_keys.csv b/tests/data/samplesheets/samplesheet-multiple_keys.csv new file mode 100644 index 0000000..74f034a --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_keys.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, 
+sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_multiplekeys.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiple_queries.csv b/tests/data/samplesheets/samplesheet-multiple_queries.csv new file mode 100644 index 0000000..c8e76de --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiple_queries.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sampleN,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleN.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv new file mode 100644 index 0000000..90d6289 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 
+sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_multiplekeys_nomatch.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet1.csv b/tests/data/samplesheets/samplesheet1.csv new file mode 100644 index 0000000..b0d6f9e --- /dev/null +++ b/tests/data/samplesheets/samplesheet1.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_emptyJSON.csv b/tests/data/samplesheets/samplesheet_emptyJSON.csv new file mode 100644 index 0000000..0b84688 --- /dev/null +++ b/tests/data/samplesheets/samplesheet_emptyJSON.csv @@ -0,0 +1,6 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2_empty.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 + diff --git a/tests/data/samplesheets/samplesheet_gzip.csv b/tests/data/samplesheets/samplesheet_gzip.csv new file mode 100644 index 0000000..e35b3e9 --- /dev/null +++ b/tests/data/samplesheets/samplesheet_gzip.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, 
+sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json.gz,1.1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1.2 diff --git a/tests/data/samplesheets/samplesheet_thresh_1.csv b/tests/data/samplesheets/samplesheet_thresh_1.csv new file mode 100644 index 0000000..f4b6b93 --- /dev/null +++ b/tests/data/samplesheets/samplesheet_thresh_1.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1 diff --git a/tests/data/samplesheets/samplesheet_thresh_1_0.csv b/tests/data/samplesheets/samplesheet_thresh_1_0.csv new file mode 100644 index 0000000..9260f3f --- /dev/null +++ b/tests/data/samplesheets/samplesheet_thresh_1_0.csv @@ -0,0 +1,5 @@ +sample,mlst_alleles,address +sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json, +sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1.1 +sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2.mlst.json,1.1 +sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3.mlst.json,1.1 diff --git a/tests/modules/cluster_file/main.nf.test b/tests/modules/cluster_file/main.nf.test new file mode 100644 index 0000000..3f13833 --- /dev/null +++ b/tests/modules/cluster_file/main.nf.test @@ -0,0 
+1,58 @@ +nextflow_process { + name "Test Process CLUSTER_FILE" + script "modules/local/cluster_file/main.nf" + process "CLUSTER_FILE" + + test("Test when sample levels are equal") { + + when { + process { + """ + input[0] = Channel.of( + [['id':'sample1', 'address':'1.1.1'], + ['id':'sample2', 'address':'1.1.1'], + ['id':'sample3', 'address':'1.1.2']] + ) + """ + } + + params { + outdir = "cluster_results" + } + } + + then { + assert process.success + assert path("$launchDir/cluster_results").exists() + + // Check reference_clusters file + def actual_clusters = path("$launchDir/cluster_results/cluster/reference_clusters.txt") + def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_clusters.text == expected_clusters.text + } + } + + test("Test when sample levels are different") { + + when { + process { + """ + input[0] = Channel.of( + [['id':'sample1', 'address':'1.1.1'], + ['id':'sample2', 'address':'1.1.1'], + ['id':'sample3', 'address':'1.2']] + ) + """ + } + + params { + outdir = "cluster_results" + } + } + + then { + assert process.failed + assert (process.stdout =~ /Inconsistent levels found: expected 3 levels but found 2 levels in sample3/).find() + } + } +} diff --git a/tests/modules/local/assemblystub/main.nf.test b/tests/modules/local/assemblystub/main.nf.test deleted file mode 100644 index 881bf56..0000000 --- a/tests/modules/local/assemblystub/main.nf.test +++ /dev/null @@ -1,38 +0,0 @@ -nextflow_process { - - name "Test Process ASSEMBLY_STUB" - script "modules/local/assemblystub/main.nf" - process "ASSEMBLY_STUB" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")]) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert assembly.size() == 1 - - // parse assembly file - def 
assembly_header = path(assembly.get(0)[1]).linesGzip[0] - def assembly_body = path(assembly.get(0)[1]).linesGzip[1] - - assert assembly_header.equals(">SAMPLE1-stub-assembly") - assert assembly_body.equals("ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT") - } - } - - } - -} diff --git a/tests/modules/local/generatesamplejson/main.nf.test b/tests/modules/local/generatesamplejson/main.nf.test deleted file mode 100644 index ac071a3..0000000 --- a/tests/modules/local/generatesamplejson/main.nf.test +++ /dev/null @@ -1,40 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SAMPLE_JSON" - script "modules/local/generatesamplejson/main.nf" - process "GENERATE_SAMPLE_JSON" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz")) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert json.size() == 1 - - // parse output json file - def sample_json_string = path(json.get(0)[1]).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def sample_json = parser.parseText(sample_json_string) - - assert sample_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - assert sample_json.metadata.samples.SAMPLE1.reads[0].equals("sample1_R1.fastq.gz") - assert sample_json.metadata.samples.SAMPLE1.reads[1].equals("sample1_R2.fastq.gz") - } - } - - } - -} diff --git a/tests/modules/local/generatesummary/main.nf.test b/tests/modules/local/generatesummary/main.nf.test deleted file mode 100644 index b2eb189..0000000 --- a/tests/modules/local/generatesummary/main.nf.test +++ /dev/null @@ -1,37 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SUMMARY" - script "modules/local/generatesummary/main.nf" - process "GENERATE_SUMMARY" - - test("Basic execution, check 
output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = [new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz"))] - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert summary.size() == 1 - - assert path(summary.get(0)).linesGzip[0].equals("IRIDANEXTEXAMPLE Pipeline Summary") - assert path(summary.get(0)).linesGzip[4].equals("SAMPLE1:") - assert path(summary.get(0)).linesGzip[5].contains("reads.1: ") - assert path(summary.get(0)).linesGzip[6].contains("reads.2: ") - assert path(summary.get(0)).linesGzip[7].contains("assembly: ") - } - } - - } - -} diff --git a/tests/modules/local/iridanextoutput/main.nf.test b/tests/modules/local/iridanextoutput/main.nf.test deleted file mode 100644 index 72808ab..0000000 --- a/tests/modules/local/iridanextoutput/main.nf.test +++ /dev/null @@ -1,51 +0,0 @@ -nextflow_process { - - name "Test Process IRIDA_NEXT_OUTPUT" - script "modules/local/iridanextoutput/main.nf" - process "IRIDA_NEXT_OUTPUT" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = [file("$baseDir/tests/data/SAMPLE1.simple.json.gz"), file("$baseDir/tests/data/SAMPLE2.simple.json.gz"), file("$baseDir/tests/data/SAMPLE3.simple.json.gz")] - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert output_json.size() == 1 - - // parse output json file - def json_string = path(output_json.get(0)).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def irida_json = parser.parseText(json_string) - - assert irida_json.files.global[0].path.equals("summary/summary.txt.gz") - - assert irida_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - assert 
irida_json.files.samples.SAMPLE2[0].path.equals("assembly/SAMPLE2.assembly.fa.gz") - assert irida_json.files.samples.SAMPLE3[0].path.equals("assembly/SAMPLE3.assembly.fa.gz") - - assert irida_json.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz") - - assert irida_json.metadata.samples.SAMPLE2.'reads.1'.equals("sample2_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE2.'reads.2'.equals("sample2_R2.fastq.gz") - - assert irida_json.metadata.samples.SAMPLE3.'reads.1'.equals("sample1_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE3.'reads.2'.equals("null") - } - } - - } - -} diff --git a/tests/modules/local/simplifyiridajson/main.nf.test b/tests/modules/local/simplifyiridajson/main.nf.test deleted file mode 100644 index 7d61567..0000000 --- a/tests/modules/local/simplifyiridajson/main.nf.test +++ /dev/null @@ -1,41 +0,0 @@ -nextflow_process { - - name "Test Process SIMPLIFY_IRIDA_JSON" - script "modules/local/simplifyiridajson/main.nf" - process "SIMPLIFY_IRIDA_JSON" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], file("$baseDir/tests/data/SAMPLE1.json.gz")) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert simple_json.size() == 1 - - // parse output json file - def json_string = path(simple_json.get(0)[1]).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def json_simple = parser.parseText(json_string) - - assert json_simple.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - - assert json_simple.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz") - assert json_simple.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz") - } - } - - } - -} diff --git a/tests/nextflow.config b/tests/nextflow.config index 
c19b1ad..2e79f3c 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -3,3 +3,20 @@ Nextflow config file for running tests ======================================================================================== */ + + +params.max_memory = "2.GB" +params.max_cpus = 1 +params.ref_clusters = "$baseDir/tests/data/clusters/expected_clusters.txt" + + +/* This is required to run in WSL/Ubuntu using singularity +Without this, profile_dists was not successfully completing +due to issues with multiprocessing in the container. A similar +error is found at https://github.com/marcelm/cutadapt/issues/583 +*/ +singularity.runOptions = "--contain" + +/* Remove gzipping on JSON output for testing/asserts on file contents +*/ +iridanext.output.path = "${params.outdir}/iridanext.output.json" diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test new file mode 100644 index 0000000..223b7de --- /dev/null +++ b/tests/pipelines/main.nf.test @@ -0,0 +1,357 @@ +nextflow_pipeline { + + name "Integration test of nomenclature assignment pipeline" + script "main.nf" + + test("Small-scale test of full pipeline"){ + tag "pipeline_success" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def 
actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Small-scale test of full pipeline with scaled distances"){ + tag "pipeline_success_scaled" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + pd_distm = "scaled" + gm_thresholds = "50,20,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_scaled1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") 
+ def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_scaled2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_scaled.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_scaled.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/scaled_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.2.3" + } + } + + + test("Small-scale test of full pipeline with multiple queries"){ + tag "pipeline_success_multiple_queries" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_queries.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_queries1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = 
path("$baseDir/tests/data/profiles/expected-profile_queries2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_queries_dists.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_queries.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/queries_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleN") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleN."address" == "2.2.3" + } + } + + test("Small-scale test of full pipeline with gzipped MLST JSON"){ + tag "Gzipped_MLST_JSON" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_gzip.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check that sample1.mlst.json.gz has been opened, read, and that a new gzipped file has been generated + assert 
path("$launchDir/results/input/sample1.mlst.json.gz").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/test_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing when query and reference sample IDs are mismatched with MLST JSON file keys"){ + // IDs in the sample sheet and IDs in the individual MLST JSON files will not match. + // This tests the pipeline's ability to handle and correct for this problem. + + tag "mismatched_IDs" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-mismatched_IDs.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample2_error_report.csv").readLines() + assert lines.contains("sample2,[\'sample7\'],Reference sample2 ID and JSON key in sample7.mlst.json DO NOT MATCH. The 'sample7' key in sample7.mlst.json has been forcefully changed to 'sample2': User should manually check input files to ensure correctness.") + + lines = path("$launchDir/results/input/sampleR_error_report.csv").readLines() + assert lines.contains("sampleR,[\'sampleF\'],Query sampleR ID and JSON key in sampleF.mlst.json DO NOT MATCH. 
The 'sampleF' key in sampleF.mlst.json has been forcefully changed to 'sampleR': User should manually check input files to ensure correctness.") + + // Check filter_query csv file + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,2.2.3") + assert lines.contains("sampleR,2.2.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/mismatched_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 2 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.containsKey("sampleR") + + assert iridanext_metadata.sampleQ."address" == "2.2.3" + assert iridanext_metadata.sampleR."address" == "2.2.3" + } + } + + test("Testing data removal in MLST JSON with a matching sampleID key."){ + // There are multiple sample entries (keys) in the MLST JSON and one of them matches the sampleID. + // This test evaluates the pipeline's ability to address this issue by removing keys that do not match the sampleID. 
+ + tag "multiple_keys_with_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiple_keys.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains('sample3,"[\'extra_key\', \'sample3\']","MLST JSON file (sample3_multiplekeys.mlst.json) contains multiple keys: [\'extra_key\', \'sample3\']. The MLST JSON file has been modified to retain only the \'sample3\' entry"') + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing the removal of data in MLST JSON with no sampleID match."){ + // There are multiple sample entries (keys) in the MLST JSON and none of them match the sampleID. 
+ // This test ensures the pipeline can handle and resolve this issue by retaining only the first JSON key entry and renaming it to match the sampleID. + + tag "multiple_keys_without_matching_ID" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-multiplekeys_nomatch.csv" + outdir = "results" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results.txt") + assert actual_calls.text == expected_calls.text + + // Check outputs + def lines = [] + + // Ensure that the error_reports are generated for query and reference samples + lines = path("$launchDir/results/input/sample3_error_report.csv").readLines() + assert lines.contains("sample3,\"[\'sample4\', \'extra_key\']\",No key in the MLST JSON file (sample3_multiplekeys_nomatch.mlst.json) matches the specified sample ID \'sample3\'. 
The first key \'sample4\' has been forcefully changed to \'sample3\' and all other keys have been removed.") + + // Check filtered query csv results + lines = path("$launchDir/results/filter/new_addresses.csv").readLines() + assert lines.contains("sampleQ,1.1.3") + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/multiplekeys_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_samples.sample3.size() == 1 + assert iridanext_samples.sample3[0].path == 'input/sample3_error_report.csv' + + assert iridanext_metadata.size() == 1 + assert iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.1.3" + } + } + + test("Testing when provided MLST JSON file(s) are empty."){ + tag "empty_JSON" + + when{ + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_emptyJSON.csv" + outdir = "results" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /sample2_empty.mlst.json is completely empty!/).find() + } + } +} diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test new file mode 100644 index 0000000..fb76112 --- /dev/null +++ b/tests/pipelines/main_gm_threshold.nf.test @@ -0,0 +1,211 @@ +nextflow_pipeline { + + name "Integration Tests of adjusting gm_thresholds parameters" + script "main.nf" + + test("Test fail pipeline if null threshold set") { + tag "pipeline_failure_null_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = null + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ --gm_thresholds null: Cannot pass null or empty string") + } + } + + test("Test fail pipeline if empty 
threshold set") { + tag "pipeline_failure_no_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ --gm_thresholds : Cannot pass null or empty string") + } + } + + test("Test fail pipeline if negative threshold set") { + tag "pipeline_failure_negative_threshold" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "-1" + } + } + + then { + assert workflow.failed + assert workflow.stderr.contains('* --gm_thresholds: string [-1] does not match pattern ^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$ (-1)') + } + } + + test("Test fail pipeline if mismatch between thresholds and scaled distm") { + tag "pipeline_failure_threshold_scaled" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "200,50,0" + pd_distm = "scaled" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 200,50,0' contains thresholds outside of range [0,100]." + + " Please either set '--pd_distm hamming' or adjust the threshold values.") + } + } + + test("Test fail pipeline if mismatch between thresholds and hamming distm") { + tag "pipeline_failure_threshold_hamming" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "2,1,0.5" + pd_distm = "hamming" + } + } + + then { + assert workflow.failed + assert workflow.stdout.contains("ERROR ~ '--pd_distm hamming' is set, but '--gm_thresholds 2,1,0.5' contains fractions." 
+ + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") + } + } + + test("Test fail pipeline with single threshold set to 1") { + tag "pipeline_thresh_1_fail" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Error \[1.0\] supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find() + } + } + + test("Test pipeline with single threshold set to 1") { + tag "pipeline_thresh_1_success" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_thresh_1.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_thresh_1.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/thresh1_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + test("Test fail pipeline with threshold set to 1,0") { + tag "pipeline_thresh_1_0_fail" + + when { + params { + input = 
"$baseDir/tests/data/samplesheets/samplesheet1.csv" + outdir = "results" + + gm_thresholds = "1,0" + } + } + + then { + assert workflow.failed + assert (workflow.stdout =~ /Error \[1.0, 0.0\] supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find() + } + } + + test("Test pipeline with threshold set to 1,0") { + tag "pipeline_thresh_1_0_success" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet_thresh_1_0.csv" + outdir = "results" + + gm_thresholds = "1,0" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_thresh_1_0.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_thresh_1_0.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/thresh1.0_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1.2" + } + } +} diff --git a/tests/pipelines/main_missing_alleles.nf.test b/tests/pipelines/main_missing_alleles.nf.test new file mode 100644 index 0000000..51b2e71 --- /dev/null +++ b/tests/pipelines/main_missing_alleles.nf.test @@ -0,0 +1,174 @@ +nextflow_pipeline { + + name "Integration Tests for parameters dealing with missing or removed alleles" + script 
"main.nf" + + test("Full pipeline hashes and missing data") { + tag "pipeline_hashes_missing" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check merged profiles + def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv") + def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile_missing1.tsv") + assert actual_profile_ref.text == expected_profile_tsv.text + + // Check query profiles + def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv") + def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile_missing2.tsv") + assert actual_profile_query.text == expected_profile_query_tsv.text + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_missing.txt") + assert actual_distances.text == expected_distances.text + + // Verify cluster file + def actual_cluster = path("$launchDir/results/cluster/reference_clusters.txt") + def expected_cluster = path("$baseDir/tests/data/clusters/expected_clusters_missing.txt") + assert actual_cluster.text == expected_cluster.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/missing_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = 
iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + test("Full pipeline hashes and missing data count missing as differences") { + tag "pipeline_hashes_missing_count_missing" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + pd_count_missing = true + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_count-missing.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_count-missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/count-missing_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + test("Full pipeline remove loci with missing data") { + tag "pipeline_hashes_remove_missing_loci" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "1" + pd_count_missing = true + pd_missing_threshold = 0.5 + } + } + + then { + assert workflow.success + 
assert path("$launchDir/results").exists() + + // Check computed pairwise distances + def actual_distances = path("$launchDir/results/distances/results.text") + def expected_distances = path("$baseDir/tests/data/distances/expected_dists_loci-missing.txt") + assert actual_distances.text == expected_distances.text + + // Check called clusters + def actual_calls = path("$launchDir/results/call/Called/results.text") + def expected_calls = path("$baseDir/tests/data/called/expected_results_loci-missing.txt") + assert actual_calls.text == expected_calls.text + + // Check IRIDA Next JSON output + assert path("$launchDir/results/iridanext.output.json").json == path("$baseDir/tests/data/irida/loci-missing_iridanext.output.json").json + + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + assert iridanext_metadata.size() == 1 && iridanext_metadata.containsKey("sampleQ") + assert iridanext_metadata.sampleQ."address" == "1" + } + } + + test("Test fail pipeline if non-existent columns file is passed") { + tag "pipeline_failure_columns_no_exist" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + pd_columns = "./no-exist" + } + } + + then { + assert workflow.failed + assert workflow.stderr.contains("* --pd_columns: the file or directory './no-exist' does not exist.") + } + } + + test("Test failure of pipeline when keeping no loci") { + tag "pipeline_keep_zero_loci" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-hash_missing.csv" + outdir = "results" + + gm_thresholds = "0" + pd_columns = "$baseDir/tests/data/columns/keep-zero-loci-empty-file.txt" + } + } + + then { + assert workflow.failed + } + } +} + diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf index 453a922..8972669 100644 --- a/workflows/gas_nomenclature.nf 
+++ b/workflows/gas_nomenclature.nf @@ -22,10 +22,13 @@ include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { GENERATE_SAMPLE_JSON } from '../modules/local/generatesamplejson/main' -include { SIMPLIFY_IRIDA_JSON } from '../modules/local/simplifyiridajson/main' -include { IRIDA_NEXT_OUTPUT } from '../modules/local/iridanextoutput/main' -include { GENERATE_SUMMARY } from '../modules/local/generatesummary/main' +include { INPUT_ASSURE } from "../modules/local/input_assure/main" +include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" +include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" +include { PROFILE_DISTS } from "../modules/local/profile_dists/main" +include { CLUSTER_FILE } from "../modules/local/cluster_file/main" +include { GAS_CALL } from "../modules/local/gas/call/main" +include { FILTER_QUERY } from "../modules/local/filter_query/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -37,10 +40,6 @@ include { GENERATE_SUMMARY } from '../modules/local/generatesummary/main' // MODULE: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main" -include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main" -include { GAS_CALL } from "../modules/local/gas/call/main" -include { PROFILE_DISTS } from "../modules/local/profile_dists/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -48,57 +47,113 @@ include { PROFILE_DISTS } from "../modules/local/profile_dists/main" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + +def prepareFilePath(String filep, 
GString debug_msg){ + // Returns the file when filep exists (and logs debug_msg), null when filep is set but does not exist, and [] when filep is unset + def return_path = null + if(filep){ + def file_in = file(filep) // declared with def so it stays local instead of leaking into the script binding + if(file_in.exists()){ + return_path = file_in + log.debug debug_msg + } + }else{ + return_path = [] + } + + return return_path // empty value if file argument is null +} + workflow GAS_NOMENCLATURE { ch_versions = Channel.empty() // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema - input = Channel.fromSamplesheet("input"); - profiles = input.branch{ - ref: it[0].profile_type - query: !it[0].profile_type - errors: true // TODO add in check on file for erroneous values, may not be needed as nf-validation is working + input = Channel.fromSamplesheet("input") + + // Ensure meta.id and mlst_file keys match; generate error report for samples where id ≠ key + input_assure = INPUT_ASSURE(input) + ch_versions = ch_versions.mix(input_assure.versions) + + // Prepare reference and query TSV files for LOCIDEX_MERGE + profiles = input_assure.result.branch { + query: !it[0].address // samples without an address are the queries } + reference_values = input_assure.result.collect{ meta, mlst -> mlst} // collected from the unbranched channel, so query profiles are included here too + query_values = profiles.query.collect{ meta, mlst -> mlst } - reference_values = profiles.ref.collect{ meta, profile -> profile} - query_values = profile.query.collect{ meta, profile -> proifile } - reference_values.view() - query_values.view() - //LOCIDEX_MERGE_REF(reference_values) - //LOCIDEX_MERGE_QUERY(query_values) - - - // A channel of tuples of ({meta}, [read[0], read[1]], assembly) - //ch_tuple_read_assembly = input.join(ASSEMBLY_STUB.out.assembly) - - //GENERATE_SAMPLE_JSON ( - // ch_tuple_read_assembly - //) - //ch_versions = ch_versions.mix(GENERATE_SAMPLE_JSON.out.versions) - - //GENERATE_SUMMARY ( - // ch_tuple_read_assembly.collect{ [it] } - //) - //ch_versions = ch_versions.mix(GENERATE_SUMMARY.out.versions) - - //SIMPLIFY_IRIDA_JSON ( - // GENERATE_SAMPLE_JSON.out.json - //) - //ch_versions = 
ch_versions.mix(SIMPLIFY_IRIDA_JSON.out.versions) - //ch_simplified_jsons = SIMPLIFY_IRIDA_JSON.out.simple_json.map { meta, data -> data }.collect() // Collect JSONs - - //IRIDA_NEXT_OUTPUT ( - // samples_data=ch_simplified_jsons - //) - //ch_versions = ch_versions.mix(IRIDA_NEXT_OUTPUT.out.versions) - - //CUSTOM_DUMPSOFTWAREVERSIONS ( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - //) -} + // LOCIDEX modules + ref_tag = Channel.value("ref") + query_tag = Channel.value("value") // NOTE(review): tag is "value" rather than "query" - confirm this is intended before changing it + + merged_references = LOCIDEX_MERGE_REF(reference_values, ref_tag) + ch_versions = ch_versions.mix(merged_references.versions) + + merged_queries = LOCIDEX_MERGE_QUERY(query_values, query_tag) + ch_versions = ch_versions.mix(merged_queries.versions) + + // PROFILE DISTS processes: optional files resolve to [] when unset and to null when set but missing (which aborts the run) + mapping_file = prepareFilePath(params.pd_mapping_file, "Selecting ${params.pd_mapping_file} for --pd_mapping_file") + if(mapping_file == null){ + exit 1, "${params.pd_mapping_file}: Does not exist but was passed to the pipeline. Exiting now." + } + + columns_file = prepareFilePath(params.pd_columns, "Selecting ${params.pd_columns} for --pd_columns") + if(columns_file == null){ + exit 1, "${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now." 
+ } + distances = PROFILE_DISTS(merged_queries.combined_profiles, + merged_references.combined_profiles, + mapping_file, + columns_file) + ch_versions = ch_versions.mix(distances.versions) + + // Generate the expected_clusters.txt file from the addresses of the provided reference samples + clusters = input.filter { meta, file -> + meta.address != null + }.collect { meta, file -> + meta } + + expected_clusters = CLUSTER_FILE(clusters) + + // GAS CALL processes: validate --gm_thresholds against --pd_distm before calling clusters + + if(params.gm_thresholds == null || params.gm_thresholds == ""){ + exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string" + } + + gm_thresholds_list = params.gm_thresholds.toString().split(',') // thresholds are given as one comma-separated value + if (params.pd_distm == 'hamming') { + if (gm_thresholds_list.any { it != null && it.contains('.') }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions." + + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.") + } + } else if (params.pd_distm == 'scaled') { + if (gm_thresholds_list.any { it != null && (it as Float < 0.0 || it as Float > 100.0) }) { + exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0,100]." + + " Please either set '--pd_distm hamming' or adjust the threshold values.") + } + } else { + exit 1, "'--pd_distm ${params.pd_distm}' is an invalid value. Please set to either 'hamming' or 'scaled'." 
+ } + + called_data = GAS_CALL(expected_clusters.text, distances.results) + ch_versions = ch_versions.mix(called_data.versions) + + // Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in + query_ids = profiles.query.collect { it[0].id } + + new_addresses = FILTER_QUERY(query_ids, called_data.distances, "tsv", "csv") // NOTE(review): "tsv"/"csv" look like input/output formats - confirm against FILTER_QUERY + ch_versions = ch_versions.mix(new_addresses.versions) + + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + +} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~